diff --git a/CryptoPkg/CryptoPkg.ci.yaml b/CryptoPkg/CryptoPkg.ci.yaml
index b601bcf85ce1..8bb5717a1286 100644
--- a/CryptoPkg/CryptoPkg.ci.yaml
+++ b/CryptoPkg/CryptoPkg.ci.yaml
@@ -105,17 +105,21 @@
     # options defined in .pytool/Plugin/UncrustifyCheck
     "UncrustifyCheck": {
         "IgnoreFiles": [
+            "Library/OpensslLib/OpensslGen/crypto/params_idx.c",
             "Library/OpensslLib/OpensslGen/include/crypto/bn_conf.h",
             "Library/OpensslLib/OpensslGen/include/crypto/dso_conf.h",
+            "Library/OpensslLib/OpensslGen/include/internal/param_names.h",
             "Library/OpensslLib/OpensslGen/include/openssl/asn1.h",
             "Library/OpensslLib/OpensslGen/include/openssl/asn1t.h",
             "Library/OpensslLib/OpensslGen/include/openssl/bio.h",
             "Library/OpensslLib/OpensslGen/include/openssl/cmp.h",
             "Library/OpensslLib/OpensslGen/include/openssl/cms.h",
+            "Library/OpensslLib/OpensslGen/include/openssl/comp.h",
             "Library/OpensslLib/OpensslGen/include/openssl/conf.h",
             "Library/OpensslLib/OpensslGen/include/openssl/configuration-ec.h",
             "Library/OpensslLib/OpensslGen/include/openssl/configuration-noec.h",
             "Library/OpensslLib/OpensslGen/include/openssl/configuration.h",
+            "Library/OpensslLib/OpensslGen/include/openssl/core_names.h",
             "Library/OpensslLib/OpensslGen/include/openssl/crmf.h",
             "Library/OpensslLib/OpensslGen/include/openssl/crypto.h",
             "Library/OpensslLib/OpensslGen/include/openssl/ct.h",
@@ -133,6 +137,7 @@
             "Library/OpensslLib/OpensslGen/include/openssl/ui.h",
             "Library/OpensslLib/OpensslGen/include/openssl/x509.h",
             "Library/OpensslLib/OpensslGen/include/openssl/x509v3.h",
+            "Library/OpensslLib/OpensslGen/include/openssl/x509_acert.h",
             "Library/OpensslLib/OpensslGen/include/openssl/x509_vfy.h",
             "Library/OpensslLib/OpensslGen/providers/common/der/der_digests_gen.c",
             "Library/OpensslLib/OpensslGen/providers/common/der/der_ecx_gen.c",
diff --git a/CryptoPkg/CryptoPkg.dec b/CryptoPkg/CryptoPkg.dec
index dc521c09a805..2cc57a0e928c 100644
--- a/CryptoPkg/CryptoPkg.dec
+++ b/CryptoPkg/CryptoPkg.dec
@@ -25,6 +25,7 @@
   Library/OpensslLib/openssl
   Library/OpensslLib/openssl/include
   Library/OpensslLib/openssl/providers/common/include
+  Library/OpensslLib/openssl/providers/fips/include
   Library/OpensslLib/openssl/providers/implementations/include
   Library/OpensslLib/OpensslGen/include
   Library/OpensslLib/OpensslGen/providers/common/include
diff --git a/CryptoPkg/CryptoPkg.dsc b/CryptoPkg/CryptoPkg.dsc
index 8492f91b12ed..26ad4c459960 100644
--- a/CryptoPkg/CryptoPkg.dsc
+++ b/CryptoPkg/CryptoPkg.dsc
@@ -477,6 +477,7 @@
       MSFT:*_*_IA32_DLINK_FLAGS = /ALIGN:4096
       MSFT:*_*_X64_DLINK_FLAGS = /ALIGN:4096
+      GCC:*_*_AARCH64_DLINK_XIPFLAGS = -z common-page-size=0x1000
   }
 !endif
@@ -543,6 +544,7 @@
       MSFT:*_*_IA32_DLINK_FLAGS = /ALIGN:4096
       MSFT:*_*_X64_DLINK_FLAGS = /ALIGN:4096
+      GCC:*_*_AARCH64_DLINK_XIPFLAGS = -z common-page-size=0x1000
   }
   #
   # CryptoSmm with OpensslLib instance with no SSL or EC services
@@ -596,6 +598,7 @@
       MSFT:*_*_IA32_DLINK_FLAGS = /ALIGN:4096
       MSFT:*_*_X64_DLINK_FLAGS = /ALIGN:4096
+      GCC:*_*_AARCH64_DLINK_XIPFLAGS = -z common-page-size=0x1000
   }
   #
   # CryptoStandaloneMm with OpensslLib instance with no SSL or EC services
diff --git a/CryptoPkg/Library/BaseCryptLib/Hash/CryptParallelHash.h b/CryptoPkg/Library/BaseCryptLib/Hash/CryptParallelHash.h
index 03a1a58cb8e7..c65bffa85d9e 100644
--- a/CryptoPkg/Library/BaseCryptLib/Hash/CryptParallelHash.h
+++ b/CryptoPkg/Library/BaseCryptLib/Hash/CryptParallelHash.h
@@ -66,7 +66,8 @@
 SHA3_squeeze (
   uint64_t A[5][5],
   unsigned char *out,
   size_t len,
-  size_t r
+  size_t r,
+  int next
   );

 /**
diff --git
a/CryptoPkg/Library/BaseCryptLib/Hash/CryptSha3.c b/CryptoPkg/Library/BaseCryptLib/Hash/CryptSha3.c index 6abafc3c00e6..f996778706a7 100644 --- a/CryptoPkg/Library/BaseCryptLib/Hash/CryptSha3.c +++ b/CryptoPkg/Library/BaseCryptLib/Hash/CryptSha3.c @@ -160,7 +160,7 @@ Sha3Final ( (void)SHA3_absorb (Context->A, Context->buf, BlockSize, BlockSize); - SHA3_squeeze (Context->A, MessageDigest, Context->md_size, BlockSize); + SHA3_squeeze (Context->A, MessageDigest, Context->md_size, BlockSize, 0); return 1; } diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/AARCH64-GCC/crypto/aes/aesv8-armx.S b/CryptoPkg/Library/OpensslLib/OpensslGen/AARCH64-GCC/crypto/aes/aesv8-armx.S index 1e61e46937d2..927f1bb7e7d3 100644 --- a/CryptoPkg/Library/OpensslLib/OpensslGen/AARCH64-GCC/crypto/aes/aesv8-armx.S +++ b/CryptoPkg/Library/OpensslLib/OpensslGen/AARCH64-GCC/crypto/aes/aesv8-armx.S @@ -14,6 +14,8 @@ .align 5 aes_v8_set_encrypt_key: .Lenc_key: + AARCH64_VALID_CALL_TARGET + // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. stp x29,x30,[sp,#-16]! add x29,sp,#0 mov x3,#-1 @@ -185,7 +187,7 @@ aes_v8_set_encrypt_key: .type aes_v8_set_decrypt_key,%function .align 5 aes_v8_set_decrypt_key: -.inst 0xd503233f // paciasp + AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-16]! add x29,sp,#0 bl .Lenc_key @@ -219,13 +221,14 @@ aes_v8_set_decrypt_key: eor x0,x0,x0 // return value .Ldec_key_abort: ldp x29,x30,[sp],#16 -.inst 0xd50323bf // autiasp + AARCH64_VALIDATE_LINK_REGISTER ret .size aes_v8_set_decrypt_key,.-aes_v8_set_decrypt_key .globl aes_v8_encrypt .type aes_v8_encrypt,%function .align 5 aes_v8_encrypt: + AARCH64_VALID_CALL_TARGET ldr w3,[x2,#240] ld1 {v0.4s},[x2],#16 ld1 {v2.16b},[x0] @@ -255,6 +258,7 @@ aes_v8_encrypt: .type aes_v8_decrypt,%function .align 5 aes_v8_decrypt: + AARCH64_VALID_CALL_TARGET ldr w3,[x2,#240] ld1 {v0.4s},[x2],#16 ld1 {v2.16b},[x0] @@ -284,6 +288,7 @@ aes_v8_decrypt: .type aes_v8_ecb_encrypt,%function .align 5 aes_v8_ecb_encrypt: + AARCH64_VALID_CALL_TARGET subs x2,x2,#16 // Original input data size bigger than 16, jump to big size processing. b.ne .Lecb_big_size @@ -1030,6 +1035,8 @@ aes_v8_ecb_encrypt: .type aes_v8_cbc_encrypt,%function .align 5 aes_v8_cbc_encrypt: + AARCH64_VALID_CALL_TARGET + // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. stp x29,x30,[sp,#-16]! add x29,sp,#0 subs x2,x2,#16 @@ -1501,10 +1508,735 @@ aes_v8_cbc_encrypt: ldr x29,[sp],#16 ret .size aes_v8_cbc_encrypt,.-aes_v8_cbc_encrypt +.globl aes_v8_ctr32_encrypt_blocks_unroll12_eor3 +.type aes_v8_ctr32_encrypt_blocks_unroll12_eor3,%function +.align 5 +aes_v8_ctr32_encrypt_blocks_unroll12_eor3: + AARCH64_VALID_CALL_TARGET + // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. + stp x29,x30,[sp,#-80]! + stp d8,d9,[sp, #16] + stp d10,d11,[sp, #32] + stp d12,d13,[sp, #48] + stp d14,d15,[sp, #64] + add x29,sp,#0 + + ldr w5,[x3,#240] + + ldr w8, [x4, #12] +#ifdef __AARCH64EB__ + ld1 {v24.16b},[x4] +#else + ld1 {v24.4s},[x4] +#endif + ld1 {v2.4s,v3.4s},[x3] // load key schedule... 
+ sub w5,w5,#4 + cmp x2,#2 + add x7,x3,x5,lsl#4 // pointer to last round key + sub w5,w5,#2 + add x7, x7, #64 + ld1 {v1.4s},[x7] + add x7,x3,#32 + mov w6,w5 +#ifndef __AARCH64EB__ + rev w8, w8 +#endif + + orr v25.16b,v24.16b,v24.16b + add w10, w8, #1 + orr v26.16b,v24.16b,v24.16b + add w8, w8, #2 + orr v0.16b,v24.16b,v24.16b + rev w10, w10 + mov v25.s[3],w10 + b.ls .Lctr32_tail_unroll + cmp x2,#6 + rev w12, w8 + sub x2,x2,#3 // bias + mov v26.s[3],w12 + b.lo .Loop3x_ctr32_unroll + cmp x2,#9 + orr v27.16b,v24.16b,v24.16b + add w11, w8, #1 + orr v28.16b,v24.16b,v24.16b + add w13, w8, #2 + rev w11, w11 + orr v29.16b,v24.16b,v24.16b + add w8, w8, #3 + rev w13, w13 + mov v27.s[3],w11 + rev w14, w8 + mov v28.s[3],w13 + mov v29.s[3],w14 + sub x2,x2,#3 + b.lo .Loop6x_ctr32_unroll + + // push regs to stack when 12 data chunks are interleaved + stp x19,x20,[sp,#-16]! + stp x21,x22,[sp,#-16]! + stp x23,x24,[sp,#-16]! + stp d8,d9,[sp,#-32]! + stp d10,d11,[sp,#-32]! + + add w15,w8,#1 + add w19,w8,#2 + add w20,w8,#3 + add w21,w8,#4 + add w22,w8,#5 + add w8,w8,#6 + orr v30.16b,v24.16b,v24.16b + rev w15,w15 + orr v31.16b,v24.16b,v24.16b + rev w19,w19 + orr v8.16b,v24.16b,v24.16b + rev w20,w20 + orr v9.16b,v24.16b,v24.16b + rev w21,w21 + orr v10.16b,v24.16b,v24.16b + rev w22,w22 + orr v11.16b,v24.16b,v24.16b + rev w23,w8 + + sub x2,x2,#6 // bias + mov v30.s[3],w15 + mov v31.s[3],w19 + mov v8.s[3],w20 + mov v9.s[3],w21 + mov v10.s[3],w22 + mov v11.s[3],w23 + b .Loop12x_ctr32_unroll + +.align 4 +.Loop12x_ctr32_unroll: + aese v24.16b,v2.16b + aesmc v24.16b,v24.16b + aese v25.16b,v2.16b + aesmc v25.16b,v25.16b + aese v26.16b,v2.16b + aesmc v26.16b,v26.16b + aese v27.16b,v2.16b + aesmc v27.16b,v27.16b + aese v28.16b,v2.16b + aesmc v28.16b,v28.16b + aese v29.16b,v2.16b + aesmc v29.16b,v29.16b + aese v30.16b,v2.16b + aesmc v30.16b,v30.16b + aese v31.16b,v2.16b + aesmc v31.16b,v31.16b + aese v8.16b,v2.16b + aesmc v8.16b,v8.16b + aese v9.16b,v2.16b + aesmc v9.16b,v9.16b + aese v10.16b,v2.16b + aesmc v10.16b,v10.16b + aese v11.16b,v2.16b + aesmc v11.16b,v11.16b + ld1 {v2.4s},[x7],#16 + subs w6,w6,#2 + aese v24.16b,v3.16b + aesmc v24.16b,v24.16b + aese v25.16b,v3.16b + aesmc v25.16b,v25.16b + aese v26.16b,v3.16b + aesmc v26.16b,v26.16b + aese v27.16b,v3.16b + aesmc v27.16b,v27.16b + aese v28.16b,v3.16b + aesmc v28.16b,v28.16b + aese v29.16b,v3.16b + aesmc v29.16b,v29.16b + aese v30.16b,v3.16b + aesmc v30.16b,v30.16b + aese v31.16b,v3.16b + aesmc v31.16b,v31.16b + aese v8.16b,v3.16b + aesmc v8.16b,v8.16b + aese v9.16b,v3.16b + aesmc v9.16b,v9.16b + aese v10.16b,v3.16b + aesmc v10.16b,v10.16b + aese v11.16b,v3.16b + aesmc v11.16b,v11.16b + ld1 {v3.4s},[x7],#16 + b.gt .Loop12x_ctr32_unroll + + aese v24.16b,v2.16b + aesmc v24.16b,v24.16b + aese v25.16b,v2.16b + aesmc v25.16b,v25.16b + aese v26.16b,v2.16b + aesmc v26.16b,v26.16b + aese v27.16b,v2.16b + aesmc v27.16b,v27.16b + aese v28.16b,v2.16b + aesmc v28.16b,v28.16b + aese v29.16b,v2.16b + aesmc v29.16b,v29.16b + aese v30.16b,v2.16b + aesmc v30.16b,v30.16b + aese v31.16b,v2.16b + aesmc v31.16b,v31.16b + aese v8.16b,v2.16b + aesmc v8.16b,v8.16b + aese v9.16b,v2.16b + aesmc v9.16b,v9.16b + aese v10.16b,v2.16b + aesmc v10.16b,v10.16b + aese v11.16b,v2.16b + aesmc v11.16b,v11.16b + ld1 {v2.4s},[x7],#16 + + aese v24.16b,v3.16b + aesmc v24.16b,v24.16b + aese v25.16b,v3.16b + aesmc v25.16b,v25.16b + aese v26.16b,v3.16b + aesmc v26.16b,v26.16b + aese v27.16b,v3.16b + aesmc v27.16b,v27.16b + aese v28.16b,v3.16b + aesmc v28.16b,v28.16b + aese v29.16b,v3.16b + aesmc 
v29.16b,v29.16b + aese v30.16b,v3.16b + aesmc v30.16b,v30.16b + aese v31.16b,v3.16b + aesmc v31.16b,v31.16b + aese v8.16b,v3.16b + aesmc v8.16b,v8.16b + aese v9.16b,v3.16b + aesmc v9.16b,v9.16b + aese v10.16b,v3.16b + aesmc v10.16b,v10.16b + aese v11.16b,v3.16b + aesmc v11.16b,v11.16b + ld1 {v3.4s},[x7],#16 + + aese v24.16b,v2.16b + aesmc v24.16b,v24.16b + add w9,w8,#1 + add w10,w8,#2 + aese v25.16b,v2.16b + aesmc v25.16b,v25.16b + add w12,w8,#3 + add w11,w8,#4 + aese v26.16b,v2.16b + aesmc v26.16b,v26.16b + add w13,w8,#5 + add w14,w8,#6 + rev w9,w9 + aese v27.16b,v2.16b + aesmc v27.16b,v27.16b + add w15,w8,#7 + add w19,w8,#8 + rev w10,w10 + rev w12,w12 + aese v28.16b,v2.16b + aesmc v28.16b,v28.16b + add w20,w8,#9 + add w21,w8,#10 + rev w11,w11 + rev w13,w13 + aese v29.16b,v2.16b + aesmc v29.16b,v29.16b + add w22,w8,#11 + add w23,w8,#12 + rev w14,w14 + rev w15,w15 + aese v30.16b,v2.16b + aesmc v30.16b,v30.16b + rev w19,w19 + rev w20,w20 + aese v31.16b,v2.16b + aesmc v31.16b,v31.16b + rev w21,w21 + rev w22,w22 + aese v8.16b,v2.16b + aesmc v8.16b,v8.16b + rev w23,w23 + aese v9.16b,v2.16b + aesmc v9.16b,v9.16b + aese v10.16b,v2.16b + aesmc v10.16b,v10.16b + aese v11.16b,v2.16b + aesmc v11.16b,v11.16b + ld1 {v2.4s},[x7],#16 + + aese v24.16b,v3.16b + aesmc v24.16b,v24.16b + aese v25.16b,v3.16b + aesmc v25.16b,v25.16b + aese v26.16b,v3.16b + aesmc v26.16b,v26.16b + aese v27.16b,v3.16b + aesmc v27.16b,v27.16b + ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64 + aese v28.16b,v3.16b + aesmc v28.16b,v28.16b + aese v29.16b,v3.16b + aesmc v29.16b,v29.16b + aese v30.16b,v3.16b + aesmc v30.16b,v30.16b + aese v31.16b,v3.16b + aesmc v31.16b,v31.16b + ld1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64 + aese v8.16b,v3.16b + aesmc v8.16b,v8.16b + aese v9.16b,v3.16b + aesmc v9.16b,v9.16b + aese v10.16b,v3.16b + aesmc v10.16b,v10.16b + aese v11.16b,v3.16b + aesmc v11.16b,v11.16b + ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64 + ld1 {v3.4s},[x7],#16 + + mov x7, x3 + aese v24.16b,v2.16b + aesmc v24.16b,v24.16b + aese v25.16b,v2.16b + aesmc v25.16b,v25.16b + aese v26.16b,v2.16b + aesmc v26.16b,v26.16b + aese v27.16b,v2.16b + aesmc v27.16b,v27.16b + aese v28.16b,v2.16b + aesmc v28.16b,v28.16b + aese v29.16b,v2.16b + aesmc v29.16b,v29.16b + aese v30.16b,v2.16b + aesmc v30.16b,v30.16b + aese v31.16b,v2.16b + aesmc v31.16b,v31.16b + aese v8.16b,v2.16b + aesmc v8.16b,v8.16b + aese v9.16b,v2.16b + aesmc v9.16b,v9.16b + aese v10.16b,v2.16b + aesmc v10.16b,v10.16b + aese v11.16b,v2.16b + aesmc v11.16b,v11.16b + ld1 {v2.4s},[x7],#16 // re-pre-load rndkey[0] + + aese v24.16b,v3.16b +.inst 0xce016084 //eor3 v4.16b,v4.16b,v1.16b,v24.16b + orr v24.16b,v0.16b,v0.16b + aese v25.16b,v3.16b +.inst 0xce0164a5 //eor3 v5.16b,v5.16b,v1.16b,v25.16b + orr v25.16b,v0.16b,v0.16b + aese v26.16b,v3.16b +.inst 0xce0168c6 //eor3 v6.16b,v6.16b,v1.16b,v26.16b + orr v26.16b,v0.16b,v0.16b + aese v27.16b,v3.16b +.inst 0xce016ce7 //eor3 v7.16b,v7.16b,v1.16b,v27.16b + orr v27.16b,v0.16b,v0.16b + aese v28.16b,v3.16b +.inst 0xce017210 //eor3 v16.16b,v16.16b,v1.16b,v28.16b + orr v28.16b,v0.16b,v0.16b + aese v29.16b,v3.16b +.inst 0xce017631 //eor3 v17.16b,v17.16b,v1.16b,v29.16b + orr v29.16b,v0.16b,v0.16b + aese v30.16b,v3.16b +.inst 0xce017a52 //eor3 v18.16b,v18.16b,v1.16b,v30.16b + orr v30.16b,v0.16b,v0.16b + aese v31.16b,v3.16b +.inst 0xce017e73 //eor3 v19.16b,v19.16b,v1.16b,v31.16b + orr v31.16b,v0.16b,v0.16b + aese v8.16b,v3.16b +.inst 0xce012294 //eor3 v20.16b,v20.16b,v1.16b,v8.16b + orr v8.16b,v0.16b,v0.16b + aese v9.16b,v3.16b +.inst 0xce0126b5 //eor3 
v21.16b,v21.16b,v1.16b,v9.16b + orr v9.16b,v0.16b,v0.16b + aese v10.16b,v3.16b +.inst 0xce012ad6 //eor3 v22.16b,v22.16b,v1.16b,v10.16b + orr v10.16b,v0.16b,v0.16b + aese v11.16b,v3.16b +.inst 0xce012ef7 //eor3 v23.16b,v23.16b,v1.16b,v11.16b + orr v11.16b,v0.16b,v0.16b + ld1 {v3.4s},[x7],#16 // re-pre-load rndkey[1] + + mov v24.s[3],w9 + mov v25.s[3],w10 + mov v26.s[3],w12 + mov v27.s[3],w11 + st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 + mov v28.s[3],w13 + mov v29.s[3],w14 + mov v30.s[3],w15 + mov v31.s[3],w19 + st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x1],#64 + mov v8.s[3],w20 + mov v9.s[3],w21 + mov v10.s[3],w22 + mov v11.s[3],w23 + st1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 + + mov w6,w5 + + add w8,w8,#12 + subs x2,x2,#12 + b.hs .Loop12x_ctr32_unroll + + // pop regs from stack when 12 data chunks are interleaved + ldp d10,d11,[sp],#32 + ldp d8,d9,[sp],#32 + ldp x23,x24,[sp],#16 + ldp x21,x22,[sp],#16 + ldp x19,x20,[sp],#16 + + add x2,x2,#12 + cbz x2,.Lctr32_done_unroll + sub w8,w8,#12 + + cmp x2,#2 + b.ls .Lctr32_tail_unroll + + cmp x2,#6 + sub x2,x2,#3 // bias + add w8,w8,#3 + b.lo .Loop3x_ctr32_unroll + + sub x2,x2,#3 + add w8,w8,#3 + b.lo .Loop6x_ctr32_unroll + +.align 4 +.Loop6x_ctr32_unroll: + aese v24.16b,v2.16b + aesmc v24.16b,v24.16b + aese v25.16b,v2.16b + aesmc v25.16b,v25.16b + aese v26.16b,v2.16b + aesmc v26.16b,v26.16b + aese v27.16b,v2.16b + aesmc v27.16b,v27.16b + aese v28.16b,v2.16b + aesmc v28.16b,v28.16b + aese v29.16b,v2.16b + aesmc v29.16b,v29.16b + ld1 {v2.4s},[x7],#16 + subs w6,w6,#2 + aese v24.16b,v3.16b + aesmc v24.16b,v24.16b + aese v25.16b,v3.16b + aesmc v25.16b,v25.16b + aese v26.16b,v3.16b + aesmc v26.16b,v26.16b + aese v27.16b,v3.16b + aesmc v27.16b,v27.16b + aese v28.16b,v3.16b + aesmc v28.16b,v28.16b + aese v29.16b,v3.16b + aesmc v29.16b,v29.16b + ld1 {v3.4s},[x7],#16 + b.gt .Loop6x_ctr32_unroll + + aese v24.16b,v2.16b + aesmc v24.16b,v24.16b + aese v25.16b,v2.16b + aesmc v25.16b,v25.16b + aese v26.16b,v2.16b + aesmc v26.16b,v26.16b + aese v27.16b,v2.16b + aesmc v27.16b,v27.16b + aese v28.16b,v2.16b + aesmc v28.16b,v28.16b + aese v29.16b,v2.16b + aesmc v29.16b,v29.16b + ld1 {v2.4s},[x7],#16 + + aese v24.16b,v3.16b + aesmc v24.16b,v24.16b + aese v25.16b,v3.16b + aesmc v25.16b,v25.16b + aese v26.16b,v3.16b + aesmc v26.16b,v26.16b + aese v27.16b,v3.16b + aesmc v27.16b,v27.16b + aese v28.16b,v3.16b + aesmc v28.16b,v28.16b + aese v29.16b,v3.16b + aesmc v29.16b,v29.16b + ld1 {v3.4s},[x7],#16 + + aese v24.16b,v2.16b + aesmc v24.16b,v24.16b + add w9,w8,#1 + add w10,w8,#2 + aese v25.16b,v2.16b + aesmc v25.16b,v25.16b + add w12,w8,#3 + add w11,w8,#4 + aese v26.16b,v2.16b + aesmc v26.16b,v26.16b + add w13,w8,#5 + add w14,w8,#6 + rev w9,w9 + aese v27.16b,v2.16b + aesmc v27.16b,v27.16b + rev w10,w10 + rev w12,w12 + aese v28.16b,v2.16b + aesmc v28.16b,v28.16b + rev w11,w11 + rev w13,w13 + aese v29.16b,v2.16b + aesmc v29.16b,v29.16b + rev w14,w14 + ld1 {v2.4s},[x7],#16 + + aese v24.16b,v3.16b + aesmc v24.16b,v24.16b + aese v25.16b,v3.16b + aesmc v25.16b,v25.16b + ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64 + aese v26.16b,v3.16b + aesmc v26.16b,v26.16b + aese v27.16b,v3.16b + aesmc v27.16b,v27.16b + ld1 {v16.16b,v17.16b},[x0],#32 + aese v28.16b,v3.16b + aesmc v28.16b,v28.16b + aese v29.16b,v3.16b + aesmc v29.16b,v29.16b + ld1 {v3.4s},[x7],#16 + + mov x7, x3 + aese v24.16b,v2.16b + aesmc v24.16b,v24.16b + aese v25.16b,v2.16b + aesmc v25.16b,v25.16b + aese v26.16b,v2.16b + aesmc v26.16b,v26.16b + aese v27.16b,v2.16b + aesmc v27.16b,v27.16b + aese v28.16b,v2.16b + aesmc 
v28.16b,v28.16b + aese v29.16b,v2.16b + aesmc v29.16b,v29.16b + ld1 {v2.4s},[x7],#16 // re-pre-load rndkey[0] + + aese v24.16b,v3.16b +.inst 0xce016084 //eor3 v4.16b,v4.16b,v1.16b,v24.16b + aese v25.16b,v3.16b +.inst 0xce0164a5 //eor3 v5.16b,v5.16b,v1.16b,v25.16b + aese v26.16b,v3.16b +.inst 0xce0168c6 //eor3 v6.16b,v6.16b,v1.16b,v26.16b + aese v27.16b,v3.16b +.inst 0xce016ce7 //eor3 v7.16b,v7.16b,v1.16b,v27.16b + aese v28.16b,v3.16b +.inst 0xce017210 //eor3 v16.16b,v16.16b,v1.16b,v28.16b + aese v29.16b,v3.16b +.inst 0xce017631 //eor3 v17.16b,v17.16b,v1.16b,v29.16b + ld1 {v3.4s},[x7],#16 // re-pre-load rndkey[1] + + orr v24.16b,v0.16b,v0.16b + orr v25.16b,v0.16b,v0.16b + orr v26.16b,v0.16b,v0.16b + orr v27.16b,v0.16b,v0.16b + orr v28.16b,v0.16b,v0.16b + orr v29.16b,v0.16b,v0.16b + + mov v24.s[3],w9 + mov v25.s[3],w10 + st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 + mov v26.s[3],w12 + mov v27.s[3],w11 + st1 {v16.16b,v17.16b},[x1],#32 + mov v28.s[3],w13 + mov v29.s[3],w14 + + cbz x2,.Lctr32_done_unroll + mov w6,w5 + + cmp x2,#2 + b.ls .Lctr32_tail_unroll + + sub x2,x2,#3 // bias + add w8,w8,#3 + b .Loop3x_ctr32_unroll + +.align 4 +.Loop3x_ctr32_unroll: + aese v24.16b,v2.16b + aesmc v24.16b,v24.16b + aese v25.16b,v2.16b + aesmc v25.16b,v25.16b + aese v26.16b,v2.16b + aesmc v26.16b,v26.16b + ld1 {v2.4s},[x7],#16 + subs w6,w6,#2 + aese v24.16b,v3.16b + aesmc v24.16b,v24.16b + aese v25.16b,v3.16b + aesmc v25.16b,v25.16b + aese v26.16b,v3.16b + aesmc v26.16b,v26.16b + ld1 {v3.4s},[x7],#16 + b.gt .Loop3x_ctr32_unroll + + aese v24.16b,v2.16b + aesmc v9.16b,v24.16b + aese v25.16b,v2.16b + aesmc v10.16b,v25.16b + ld1 {v4.16b,v5.16b,v6.16b},[x0],#48 + orr v24.16b,v0.16b,v0.16b + aese v26.16b,v2.16b + aesmc v26.16b,v26.16b + ld1 {v2.4s},[x7],#16 + orr v25.16b,v0.16b,v0.16b + aese v9.16b,v3.16b + aesmc v9.16b,v9.16b + aese v10.16b,v3.16b + aesmc v10.16b,v10.16b + aese v26.16b,v3.16b + aesmc v11.16b,v26.16b + ld1 {v3.4s},[x7],#16 + orr v26.16b,v0.16b,v0.16b + add w9,w8,#1 + aese v9.16b,v2.16b + aesmc v9.16b,v9.16b + aese v10.16b,v2.16b + aesmc v10.16b,v10.16b + add w10,w8,#2 + aese v11.16b,v2.16b + aesmc v11.16b,v11.16b + ld1 {v2.4s},[x7],#16 + add w8,w8,#3 + aese v9.16b,v3.16b + aesmc v9.16b,v9.16b + aese v10.16b,v3.16b + aesmc v10.16b,v10.16b + + rev w9,w9 + aese v11.16b,v3.16b + aesmc v11.16b,v11.16b + ld1 {v3.4s},[x7],#16 + mov v24.s[3], w9 + mov x7,x3 + rev w10,w10 + aese v9.16b,v2.16b + aesmc v9.16b,v9.16b + + aese v10.16b,v2.16b + aesmc v10.16b,v10.16b + mov v25.s[3], w10 + rev w12,w8 + aese v11.16b,v2.16b + aesmc v11.16b,v11.16b + mov v26.s[3], w12 + + aese v9.16b,v3.16b + aese v10.16b,v3.16b + aese v11.16b,v3.16b + +.inst 0xce012484 //eor3 v4.16b,v4.16b,v1.16b,v9.16b + ld1 {v2.4s},[x7],#16 // re-pre-load rndkey[0] +.inst 0xce0128a5 //eor3 v5.16b,v5.16b,v1.16b,v10.16b + mov w6,w5 +.inst 0xce012cc6 //eor3 v6.16b,v6.16b,v1.16b,v11.16b + ld1 {v3.4s},[x7],#16 // re-pre-load rndkey[1] + st1 {v4.16b,v5.16b,v6.16b},[x1],#48 + + cbz x2,.Lctr32_done_unroll + +.Lctr32_tail_unroll: + cmp x2,#1 + b.eq .Lctr32_tail_1_unroll + +.Lctr32_tail_2_unroll: + aese v24.16b,v2.16b + aesmc v24.16b,v24.16b + aese v25.16b,v2.16b + aesmc v25.16b,v25.16b + ld1 {v2.4s},[x7],#16 + subs w6,w6,#2 + aese v24.16b,v3.16b + aesmc v24.16b,v24.16b + aese v25.16b,v3.16b + aesmc v25.16b,v25.16b + ld1 {v3.4s},[x7],#16 + b.gt .Lctr32_tail_2_unroll + + aese v24.16b,v2.16b + aesmc v24.16b,v24.16b + aese v25.16b,v2.16b + aesmc v25.16b,v25.16b + ld1 {v2.4s},[x7],#16 + aese v24.16b,v3.16b + aesmc v24.16b,v24.16b + aese v25.16b,v3.16b + aesmc 
v25.16b,v25.16b + ld1 {v3.4s},[x7],#16 + ld1 {v4.16b,v5.16b},[x0],#32 + aese v24.16b,v2.16b + aesmc v24.16b,v24.16b + aese v25.16b,v2.16b + aesmc v25.16b,v25.16b + ld1 {v2.4s},[x7],#16 + aese v24.16b,v3.16b + aesmc v24.16b,v24.16b + aese v25.16b,v3.16b + aesmc v25.16b,v25.16b + ld1 {v3.4s},[x7],#16 + aese v24.16b,v2.16b + aesmc v24.16b,v24.16b + aese v25.16b,v2.16b + aesmc v25.16b,v25.16b + aese v24.16b,v3.16b + aese v25.16b,v3.16b + +.inst 0xce016084 //eor3 v4.16b,v4.16b,v1.16b,v24.16b +.inst 0xce0164a5 //eor3 v5.16b,v5.16b,v1.16b,v25.16b + st1 {v4.16b,v5.16b},[x1],#32 + b .Lctr32_done_unroll + +.Lctr32_tail_1_unroll: + aese v24.16b,v2.16b + aesmc v24.16b,v24.16b + ld1 {v2.4s},[x7],#16 + subs w6,w6,#2 + aese v24.16b,v3.16b + aesmc v24.16b,v24.16b + ld1 {v3.4s},[x7],#16 + b.gt .Lctr32_tail_1_unroll + + aese v24.16b,v2.16b + aesmc v24.16b,v24.16b + ld1 {v2.4s},[x7],#16 + aese v24.16b,v3.16b + aesmc v24.16b,v24.16b + ld1 {v3.4s},[x7],#16 + ld1 {v4.16b},[x0] + aese v24.16b,v2.16b + aesmc v24.16b,v24.16b + ld1 {v2.4s},[x7],#16 + aese v24.16b,v3.16b + aesmc v24.16b,v24.16b + ld1 {v3.4s},[x7],#16 + aese v24.16b,v2.16b + aesmc v24.16b,v24.16b + aese v24.16b,v3.16b + +.inst 0xce016084 //eor3 v4.16b,v4.16b,v1.16b,v24.16b + st1 {v4.16b},[x1],#16 + +.Lctr32_done_unroll: + ldp d8,d9,[sp, #16] + ldp d10,d11,[sp, #32] + ldp d12,d13,[sp, #48] + ldp d15,d16,[sp, #64] + ldr x29,[sp],#80 + ret +.size aes_v8_ctr32_encrypt_blocks_unroll12_eor3,.-aes_v8_ctr32_encrypt_blocks_unroll12_eor3 .globl aes_v8_ctr32_encrypt_blocks .type aes_v8_ctr32_encrypt_blocks,%function .align 5 aes_v8_ctr32_encrypt_blocks: + AARCH64_VALID_CALL_TARGET + // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. stp x29,x30,[sp,#-16]! add x29,sp,#0 ldr w5,[x3,#240] @@ -1854,6 +2586,7 @@ aes_v8_ctr32_encrypt_blocks: .type aes_v8_xts_encrypt,%function .align 5 aes_v8_xts_encrypt: + AARCH64_VALID_CALL_TARGET cmp x2,#16 // Original input data size bigger than 16, jump to big size processing. b.ne .Lxts_enc_big_size @@ -2495,6 +3228,7 @@ aes_v8_xts_encrypt: .type aes_v8_xts_decrypt,%function .align 5 aes_v8_xts_decrypt: + AARCH64_VALID_CALL_TARGET cmp x2,#16 // Original input data size bigger than 16, jump to big size processing. b.ne .Lxts_dec_big_size @@ -3104,7 +3838,7 @@ aes_v8_xts_decrypt: cbnz x2,.Lxts_dec_1st_done ld1 {v0.16b},[x0],#16 - // Decrypt the last secod block to get the last plain text block + // Decrypt the last second block to get the last plain text block .Lxts_dec_1st_done: eor v26.16b,v0.16b,v8.16b ldr w6,[x3,#240] diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/AARCH64-GCC/crypto/aes/bsaes-armv8.S b/CryptoPkg/Library/OpensslLib/OpensslGen/AARCH64-GCC/crypto/aes/bsaes-armv8.S new file mode 100644 index 000000000000..8a8668262a73 --- /dev/null +++ b/CryptoPkg/Library/OpensslLib/OpensslGen/AARCH64-GCC/crypto/aes/bsaes-armv8.S @@ -0,0 +1,2347 @@ +// Copyright 2021-2024 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the OpenSSL license (the "License"). You may not use +// this file except in compliance with the License. You can obtain a copy +// in the file LICENSE in the source distribution or at +// https://www.openssl.org/source/license.html +// +// ==================================================================== +// Written by Ben Avison for the OpenSSL +// project. Rights for redistribution and usage in source and binary +// forms are granted according to the OpenSSL license. 
+// ==================================================================== +// +// This implementation is a translation of bsaes-armv7 for AArch64. +// No attempt has been made to carry across the build switches for +// kernel targets, since the Linux kernel crypto support has moved on +// from when it was based on OpenSSL. + +// A lot of hand-scheduling has been performed. Consequently, this code +// doesn't factor out neatly into macros in the same way that the +// AArch32 version did, and there is little to be gained by wrapping it +// up in Perl, and it is presented as pure assembly. + + +#include "crypto/arm_arch.h" + +.text + + + + + +.type _bsaes_decrypt8,%function +.align 4 +// On entry: +// x9 -> key (previously expanded using _bsaes_key_convert) +// x10 = number of rounds +// v0-v7 input data +// On exit: +// x9-x11 corrupted +// other general-purpose registers preserved +// v0-v7 output data +// v11-v15 preserved +// other SIMD registers corrupted +_bsaes_decrypt8: + ldr q8, [x9], #16 + adr x11, .LM0ISR + movi v9.16b, #0x55 + ldr q10, [x11], #16 + movi v16.16b, #0x33 + movi v17.16b, #0x0f + sub x10, x10, #1 + eor v0.16b, v0.16b, v8.16b + eor v1.16b, v1.16b, v8.16b + eor v2.16b, v2.16b, v8.16b + eor v4.16b, v4.16b, v8.16b + eor v3.16b, v3.16b, v8.16b + eor v5.16b, v5.16b, v8.16b + tbl v0.16b, {v0.16b}, v10.16b + tbl v1.16b, {v1.16b}, v10.16b + tbl v2.16b, {v2.16b}, v10.16b + tbl v4.16b, {v4.16b}, v10.16b + eor v6.16b, v6.16b, v8.16b + eor v7.16b, v7.16b, v8.16b + tbl v3.16b, {v3.16b}, v10.16b + tbl v5.16b, {v5.16b}, v10.16b + tbl v6.16b, {v6.16b}, v10.16b + ushr v8.2d, v0.2d, #1 + tbl v7.16b, {v7.16b}, v10.16b + ushr v10.2d, v4.2d, #1 + ushr v18.2d, v2.2d, #1 + eor v8.16b, v8.16b, v1.16b + ushr v19.2d, v6.2d, #1 + eor v10.16b, v10.16b, v5.16b + eor v18.16b, v18.16b, v3.16b + and v8.16b, v8.16b, v9.16b + eor v19.16b, v19.16b, v7.16b + and v10.16b, v10.16b, v9.16b + and v18.16b, v18.16b, v9.16b + eor v1.16b, v1.16b, v8.16b + shl v8.2d, v8.2d, #1 + and v9.16b, v19.16b, v9.16b + eor v5.16b, v5.16b, v10.16b + shl v10.2d, v10.2d, #1 + eor v3.16b, v3.16b, v18.16b + shl v18.2d, v18.2d, #1 + eor v0.16b, v0.16b, v8.16b + shl v8.2d, v9.2d, #1 + eor v7.16b, v7.16b, v9.16b + eor v4.16b, v4.16b, v10.16b + eor v2.16b, v2.16b, v18.16b + ushr v9.2d, v1.2d, #2 + eor v6.16b, v6.16b, v8.16b + ushr v8.2d, v0.2d, #2 + ushr v10.2d, v5.2d, #2 + ushr v18.2d, v4.2d, #2 + eor v9.16b, v9.16b, v3.16b + eor v8.16b, v8.16b, v2.16b + eor v10.16b, v10.16b, v7.16b + eor v18.16b, v18.16b, v6.16b + and v9.16b, v9.16b, v16.16b + and v8.16b, v8.16b, v16.16b + and v10.16b, v10.16b, v16.16b + and v16.16b, v18.16b, v16.16b + eor v3.16b, v3.16b, v9.16b + shl v9.2d, v9.2d, #2 + eor v2.16b, v2.16b, v8.16b + shl v8.2d, v8.2d, #2 + eor v7.16b, v7.16b, v10.16b + shl v10.2d, v10.2d, #2 + eor v6.16b, v6.16b, v16.16b + shl v16.2d, v16.2d, #2 + eor v1.16b, v1.16b, v9.16b + eor v0.16b, v0.16b, v8.16b + eor v5.16b, v5.16b, v10.16b + eor v4.16b, v4.16b, v16.16b + ushr v8.2d, v3.2d, #4 + ushr v9.2d, v2.2d, #4 + ushr v10.2d, v1.2d, #4 + ushr v16.2d, v0.2d, #4 + eor v8.16b, v8.16b, v7.16b + eor v9.16b, v9.16b, v6.16b + eor v10.16b, v10.16b, v5.16b + eor v16.16b, v16.16b, v4.16b + and v8.16b, v8.16b, v17.16b + and v9.16b, v9.16b, v17.16b + and v10.16b, v10.16b, v17.16b + and v16.16b, v16.16b, v17.16b + eor v7.16b, v7.16b, v8.16b + shl v8.2d, v8.2d, #4 + eor v6.16b, v6.16b, v9.16b + shl v9.2d, v9.2d, #4 + eor v5.16b, v5.16b, v10.16b + shl v10.2d, v10.2d, #4 + eor v4.16b, v4.16b, v16.16b + shl v16.2d, v16.2d, #4 + eor v3.16b, v3.16b, v8.16b 
+ eor v2.16b, v2.16b, v9.16b + eor v1.16b, v1.16b, v10.16b + eor v0.16b, v0.16b, v16.16b + b .Ldec_sbox +.align 4 +.Ldec_loop: + ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x9], #64 + ldp q8, q9, [x9], #32 + eor v0.16b, v16.16b, v0.16b + ldr q10, [x9], #16 + eor v1.16b, v17.16b, v1.16b + ldr q16, [x9], #16 + eor v2.16b, v18.16b, v2.16b + eor v3.16b, v19.16b, v3.16b + eor v4.16b, v8.16b, v4.16b + eor v5.16b, v9.16b, v5.16b + eor v6.16b, v10.16b, v6.16b + eor v7.16b, v16.16b, v7.16b + tbl v0.16b, {v0.16b}, v28.16b + tbl v1.16b, {v1.16b}, v28.16b + tbl v2.16b, {v2.16b}, v28.16b + tbl v3.16b, {v3.16b}, v28.16b + tbl v4.16b, {v4.16b}, v28.16b + tbl v5.16b, {v5.16b}, v28.16b + tbl v6.16b, {v6.16b}, v28.16b + tbl v7.16b, {v7.16b}, v28.16b +.Ldec_sbox: + eor v1.16b, v1.16b, v4.16b + eor v3.16b, v3.16b, v4.16b + subs x10, x10, #1 + eor v4.16b, v4.16b, v7.16b + eor v2.16b, v2.16b, v7.16b + eor v1.16b, v1.16b, v6.16b + eor v6.16b, v6.16b, v4.16b + eor v2.16b, v2.16b, v5.16b + eor v0.16b, v0.16b, v1.16b + eor v7.16b, v7.16b, v6.16b + eor v8.16b, v6.16b, v2.16b + and v9.16b, v4.16b, v6.16b + eor v10.16b, v2.16b, v6.16b + eor v3.16b, v3.16b, v0.16b + eor v5.16b, v5.16b, v0.16b + eor v16.16b, v7.16b, v4.16b + eor v17.16b, v4.16b, v0.16b + and v18.16b, v0.16b, v2.16b + eor v19.16b, v7.16b, v4.16b + eor v1.16b, v1.16b, v3.16b + eor v20.16b, v3.16b, v0.16b + eor v21.16b, v5.16b, v2.16b + eor v22.16b, v3.16b, v7.16b + and v8.16b, v17.16b, v8.16b + orr v17.16b, v3.16b, v5.16b + eor v23.16b, v1.16b, v6.16b + eor v24.16b, v20.16b, v16.16b + eor v25.16b, v1.16b, v5.16b + orr v26.16b, v20.16b, v21.16b + and v20.16b, v20.16b, v21.16b + and v27.16b, v7.16b, v1.16b + eor v21.16b, v21.16b, v23.16b + orr v28.16b, v16.16b, v23.16b + orr v29.16b, v22.16b, v25.16b + eor v26.16b, v26.16b, v8.16b + and v16.16b, v16.16b, v23.16b + and v22.16b, v22.16b, v25.16b + and v21.16b, v24.16b, v21.16b + eor v8.16b, v28.16b, v8.16b + eor v23.16b, v5.16b, v2.16b + eor v24.16b, v1.16b, v6.16b + eor v16.16b, v16.16b, v22.16b + eor v22.16b, v3.16b, v0.16b + eor v25.16b, v29.16b, v21.16b + eor v21.16b, v26.16b, v21.16b + eor v8.16b, v8.16b, v20.16b + eor v26.16b, v23.16b, v24.16b + eor v16.16b, v16.16b, v20.16b + eor v28.16b, v22.16b, v19.16b + eor v20.16b, v25.16b, v20.16b + eor v9.16b, v21.16b, v9.16b + eor v8.16b, v8.16b, v18.16b + eor v18.16b, v5.16b, v1.16b + eor v21.16b, v16.16b, v17.16b + eor v16.16b, v16.16b, v17.16b + eor v17.16b, v20.16b, v27.16b + eor v20.16b, v3.16b, v7.16b + eor v25.16b, v9.16b, v8.16b + eor v27.16b, v0.16b, v4.16b + and v29.16b, v9.16b, v17.16b + eor v30.16b, v8.16b, v29.16b + eor v31.16b, v21.16b, v29.16b + eor v29.16b, v21.16b, v29.16b + bsl v30.16b, v17.16b, v21.16b + bsl v31.16b, v9.16b, v8.16b + bsl v16.16b, v30.16b, v29.16b + bsl v21.16b, v29.16b, v30.16b + eor v8.16b, v31.16b, v30.16b + and v1.16b, v1.16b, v31.16b + and v9.16b, v16.16b, v31.16b + and v6.16b, v6.16b, v30.16b + eor v16.16b, v17.16b, v21.16b + and v4.16b, v4.16b, v30.16b + eor v17.16b, v8.16b, v30.16b + and v21.16b, v24.16b, v8.16b + eor v9.16b, v9.16b, v25.16b + and v19.16b, v19.16b, v8.16b + eor v24.16b, v30.16b, v16.16b + eor v25.16b, v30.16b, v16.16b + and v7.16b, v7.16b, v17.16b + and v10.16b, v10.16b, v16.16b + eor v29.16b, v9.16b, v16.16b + eor v30.16b, v31.16b, v9.16b + and v0.16b, v24.16b, v0.16b + and v9.16b, v18.16b, v9.16b + and v2.16b, v25.16b, v2.16b + eor v10.16b, v10.16b, v6.16b + eor v18.16b, v29.16b, v16.16b + and v5.16b, v30.16b, v5.16b + eor v24.16b, v8.16b, v29.16b + and v25.16b, v26.16b, v29.16b + and v26.16b, 
v28.16b, v29.16b + eor v8.16b, v8.16b, v29.16b + eor v17.16b, v17.16b, v18.16b + eor v5.16b, v1.16b, v5.16b + and v23.16b, v24.16b, v23.16b + eor v21.16b, v21.16b, v25.16b + eor v19.16b, v19.16b, v26.16b + eor v0.16b, v4.16b, v0.16b + and v3.16b, v17.16b, v3.16b + eor v1.16b, v9.16b, v1.16b + eor v9.16b, v25.16b, v23.16b + eor v5.16b, v5.16b, v21.16b + eor v2.16b, v6.16b, v2.16b + and v6.16b, v8.16b, v22.16b + eor v3.16b, v7.16b, v3.16b + and v8.16b, v20.16b, v18.16b + eor v10.16b, v10.16b, v9.16b + eor v0.16b, v0.16b, v19.16b + eor v9.16b, v1.16b, v9.16b + eor v1.16b, v2.16b, v21.16b + eor v3.16b, v3.16b, v19.16b + and v16.16b, v27.16b, v16.16b + eor v17.16b, v26.16b, v6.16b + eor v6.16b, v8.16b, v7.16b + eor v7.16b, v1.16b, v9.16b + eor v1.16b, v5.16b, v3.16b + eor v2.16b, v10.16b, v3.16b + eor v4.16b, v16.16b, v4.16b + eor v8.16b, v6.16b, v17.16b + eor v5.16b, v9.16b, v3.16b + eor v9.16b, v0.16b, v1.16b + eor v6.16b, v7.16b, v1.16b + eor v0.16b, v4.16b, v17.16b + eor v4.16b, v8.16b, v7.16b + eor v7.16b, v9.16b, v2.16b + eor v8.16b, v3.16b, v0.16b + eor v7.16b, v7.16b, v5.16b + eor v3.16b, v4.16b, v7.16b + eor v4.16b, v7.16b, v0.16b + eor v7.16b, v8.16b, v3.16b + bcc .Ldec_done + ext v8.16b, v0.16b, v0.16b, #8 + ext v9.16b, v1.16b, v1.16b, #8 + ldr q28, [x11] // load from .LISR in common case (x10 > 0) + ext v10.16b, v6.16b, v6.16b, #8 + ext v16.16b, v3.16b, v3.16b, #8 + ext v17.16b, v5.16b, v5.16b, #8 + ext v18.16b, v4.16b, v4.16b, #8 + eor v8.16b, v8.16b, v0.16b + eor v9.16b, v9.16b, v1.16b + eor v10.16b, v10.16b, v6.16b + eor v16.16b, v16.16b, v3.16b + eor v17.16b, v17.16b, v5.16b + ext v19.16b, v2.16b, v2.16b, #8 + ext v20.16b, v7.16b, v7.16b, #8 + eor v18.16b, v18.16b, v4.16b + eor v6.16b, v6.16b, v8.16b + eor v8.16b, v2.16b, v10.16b + eor v4.16b, v4.16b, v9.16b + eor v2.16b, v19.16b, v2.16b + eor v9.16b, v20.16b, v7.16b + eor v0.16b, v0.16b, v16.16b + eor v1.16b, v1.16b, v16.16b + eor v6.16b, v6.16b, v17.16b + eor v8.16b, v8.16b, v16.16b + eor v7.16b, v7.16b, v18.16b + eor v4.16b, v4.16b, v16.16b + eor v2.16b, v3.16b, v2.16b + eor v1.16b, v1.16b, v17.16b + eor v3.16b, v5.16b, v9.16b + eor v5.16b, v8.16b, v17.16b + eor v7.16b, v7.16b, v17.16b + ext v8.16b, v0.16b, v0.16b, #12 + ext v9.16b, v6.16b, v6.16b, #12 + ext v10.16b, v4.16b, v4.16b, #12 + ext v16.16b, v1.16b, v1.16b, #12 + ext v17.16b, v5.16b, v5.16b, #12 + ext v18.16b, v7.16b, v7.16b, #12 + eor v0.16b, v0.16b, v8.16b + eor v6.16b, v6.16b, v9.16b + eor v4.16b, v4.16b, v10.16b + ext v19.16b, v2.16b, v2.16b, #12 + ext v20.16b, v3.16b, v3.16b, #12 + eor v1.16b, v1.16b, v16.16b + eor v5.16b, v5.16b, v17.16b + eor v7.16b, v7.16b, v18.16b + eor v2.16b, v2.16b, v19.16b + eor v16.16b, v16.16b, v0.16b + eor v3.16b, v3.16b, v20.16b + eor v17.16b, v17.16b, v4.16b + eor v10.16b, v10.16b, v6.16b + ext v0.16b, v0.16b, v0.16b, #8 + eor v9.16b, v9.16b, v1.16b + ext v1.16b, v1.16b, v1.16b, #8 + eor v8.16b, v8.16b, v3.16b + eor v16.16b, v16.16b, v3.16b + eor v18.16b, v18.16b, v5.16b + eor v19.16b, v19.16b, v7.16b + ext v21.16b, v5.16b, v5.16b, #8 + ext v5.16b, v7.16b, v7.16b, #8 + eor v7.16b, v20.16b, v2.16b + ext v4.16b, v4.16b, v4.16b, #8 + ext v20.16b, v3.16b, v3.16b, #8 + eor v17.16b, v17.16b, v3.16b + ext v2.16b, v2.16b, v2.16b, #8 + eor v3.16b, v10.16b, v3.16b + ext v10.16b, v6.16b, v6.16b, #8 + eor v0.16b, v0.16b, v8.16b + eor v1.16b, v1.16b, v16.16b + eor v5.16b, v5.16b, v18.16b + eor v3.16b, v3.16b, v4.16b + eor v7.16b, v20.16b, v7.16b + eor v6.16b, v2.16b, v19.16b + eor v4.16b, v21.16b, v17.16b + eor v2.16b, v10.16b, v9.16b + bne 
.Ldec_loop + ldr q28, [x11, #16]! // load from .LISRM0 on last round (x10 == 0) + b .Ldec_loop +.align 4 +.Ldec_done: + ushr v8.2d, v0.2d, #1 + movi v9.16b, #0x55 + ldr q10, [x9] + ushr v16.2d, v2.2d, #1 + movi v17.16b, #0x33 + ushr v18.2d, v6.2d, #1 + movi v19.16b, #0x0f + eor v8.16b, v8.16b, v1.16b + ushr v20.2d, v3.2d, #1 + eor v16.16b, v16.16b, v7.16b + eor v18.16b, v18.16b, v4.16b + and v8.16b, v8.16b, v9.16b + eor v20.16b, v20.16b, v5.16b + and v16.16b, v16.16b, v9.16b + and v18.16b, v18.16b, v9.16b + shl v21.2d, v8.2d, #1 + eor v1.16b, v1.16b, v8.16b + and v8.16b, v20.16b, v9.16b + eor v7.16b, v7.16b, v16.16b + shl v9.2d, v16.2d, #1 + eor v4.16b, v4.16b, v18.16b + shl v16.2d, v18.2d, #1 + eor v0.16b, v0.16b, v21.16b + shl v18.2d, v8.2d, #1 + eor v5.16b, v5.16b, v8.16b + eor v2.16b, v2.16b, v9.16b + eor v6.16b, v6.16b, v16.16b + ushr v8.2d, v1.2d, #2 + eor v3.16b, v3.16b, v18.16b + ushr v9.2d, v0.2d, #2 + ushr v16.2d, v7.2d, #2 + ushr v18.2d, v2.2d, #2 + eor v8.16b, v8.16b, v4.16b + eor v9.16b, v9.16b, v6.16b + eor v16.16b, v16.16b, v5.16b + eor v18.16b, v18.16b, v3.16b + and v8.16b, v8.16b, v17.16b + and v9.16b, v9.16b, v17.16b + and v16.16b, v16.16b, v17.16b + and v17.16b, v18.16b, v17.16b + eor v4.16b, v4.16b, v8.16b + shl v8.2d, v8.2d, #2 + eor v6.16b, v6.16b, v9.16b + shl v9.2d, v9.2d, #2 + eor v5.16b, v5.16b, v16.16b + shl v16.2d, v16.2d, #2 + eor v3.16b, v3.16b, v17.16b + shl v17.2d, v17.2d, #2 + eor v1.16b, v1.16b, v8.16b + eor v0.16b, v0.16b, v9.16b + eor v7.16b, v7.16b, v16.16b + eor v2.16b, v2.16b, v17.16b + ushr v8.2d, v4.2d, #4 + ushr v9.2d, v6.2d, #4 + ushr v16.2d, v1.2d, #4 + ushr v17.2d, v0.2d, #4 + eor v8.16b, v8.16b, v5.16b + eor v9.16b, v9.16b, v3.16b + eor v16.16b, v16.16b, v7.16b + eor v17.16b, v17.16b, v2.16b + and v8.16b, v8.16b, v19.16b + and v9.16b, v9.16b, v19.16b + and v16.16b, v16.16b, v19.16b + and v17.16b, v17.16b, v19.16b + eor v5.16b, v5.16b, v8.16b + shl v8.2d, v8.2d, #4 + eor v3.16b, v3.16b, v9.16b + shl v9.2d, v9.2d, #4 + eor v7.16b, v7.16b, v16.16b + shl v16.2d, v16.2d, #4 + eor v2.16b, v2.16b, v17.16b + shl v17.2d, v17.2d, #4 + eor v4.16b, v4.16b, v8.16b + eor v6.16b, v6.16b, v9.16b + eor v7.16b, v7.16b, v10.16b + eor v1.16b, v1.16b, v16.16b + eor v2.16b, v2.16b, v10.16b + eor v0.16b, v0.16b, v17.16b + eor v4.16b, v4.16b, v10.16b + eor v6.16b, v6.16b, v10.16b + eor v3.16b, v3.16b, v10.16b + eor v5.16b, v5.16b, v10.16b + eor v1.16b, v1.16b, v10.16b + eor v0.16b, v0.16b, v10.16b + ret +.size _bsaes_decrypt8,.-_bsaes_decrypt8 + +.type _bsaes_const,%object +.align 6 +_bsaes_const: +// InvShiftRows constants +// Used in _bsaes_decrypt8, which assumes contiguity +// .LM0ISR used with round 0 key +// .LISR used with middle round keys +// .LISRM0 used with final round key +.LM0ISR: +.quad 0x0a0e0206070b0f03, 0x0004080c0d010509 +.LISR: +.quad 0x0504070602010003, 0x0f0e0d0c080b0a09 +.LISRM0: +.quad 0x01040b0e0205080f, 0x0306090c00070a0d + +// ShiftRows constants +// Used in _bsaes_encrypt8, which assumes contiguity +// .LM0SR used with round 0 key +// .LSR used with middle round keys +// .LSRM0 used with final round key +.LM0SR: +.quad 0x0a0e02060f03070b, 0x0004080c05090d01 +.LSR: +.quad 0x0504070600030201, 0x0f0e0d0c0a09080b +.LSRM0: +.quad 0x0304090e00050a0f, 0x01060b0c0207080d + +.LM0_bigendian: +.quad 0x02060a0e03070b0f, 0x0004080c0105090d +.LM0_littleendian: +.quad 0x0105090d0004080c, 0x03070b0f02060a0e + +// Used in ossl_bsaes_ctr32_encrypt_blocks, prior to dropping into +// _bsaes_encrypt8_alt, for round 0 key in place of .LM0SR +.LREVM0SR: +.quad 
0x090d01050c000408, 0x03070b0f060a0e02 + +.align 6 +.size _bsaes_const,.-_bsaes_const + +.type _bsaes_encrypt8,%function +.align 4 +// On entry: +// x9 -> key (previously expanded using _bsaes_key_convert) +// x10 = number of rounds +// v0-v7 input data +// On exit: +// x9-x11 corrupted +// other general-purpose registers preserved +// v0-v7 output data +// v11-v15 preserved +// other SIMD registers corrupted +_bsaes_encrypt8: + ldr q8, [x9], #16 + adr x11, .LM0SR + ldr q9, [x11], #16 +_bsaes_encrypt8_alt: + eor v0.16b, v0.16b, v8.16b + eor v1.16b, v1.16b, v8.16b + sub x10, x10, #1 + eor v2.16b, v2.16b, v8.16b + eor v4.16b, v4.16b, v8.16b + eor v3.16b, v3.16b, v8.16b + eor v5.16b, v5.16b, v8.16b + tbl v0.16b, {v0.16b}, v9.16b + tbl v1.16b, {v1.16b}, v9.16b + tbl v2.16b, {v2.16b}, v9.16b + tbl v4.16b, {v4.16b}, v9.16b + eor v6.16b, v6.16b, v8.16b + eor v7.16b, v7.16b, v8.16b + tbl v3.16b, {v3.16b}, v9.16b + tbl v5.16b, {v5.16b}, v9.16b + tbl v6.16b, {v6.16b}, v9.16b + ushr v8.2d, v0.2d, #1 + movi v10.16b, #0x55 + tbl v7.16b, {v7.16b}, v9.16b + ushr v9.2d, v4.2d, #1 + movi v16.16b, #0x33 + ushr v17.2d, v2.2d, #1 + eor v8.16b, v8.16b, v1.16b + movi v18.16b, #0x0f + ushr v19.2d, v6.2d, #1 + eor v9.16b, v9.16b, v5.16b + eor v17.16b, v17.16b, v3.16b + and v8.16b, v8.16b, v10.16b + eor v19.16b, v19.16b, v7.16b + and v9.16b, v9.16b, v10.16b + and v17.16b, v17.16b, v10.16b + eor v1.16b, v1.16b, v8.16b + shl v8.2d, v8.2d, #1 + and v10.16b, v19.16b, v10.16b + eor v5.16b, v5.16b, v9.16b + shl v9.2d, v9.2d, #1 + eor v3.16b, v3.16b, v17.16b + shl v17.2d, v17.2d, #1 + eor v0.16b, v0.16b, v8.16b + shl v8.2d, v10.2d, #1 + eor v7.16b, v7.16b, v10.16b + eor v4.16b, v4.16b, v9.16b + eor v2.16b, v2.16b, v17.16b + ushr v9.2d, v1.2d, #2 + eor v6.16b, v6.16b, v8.16b + ushr v8.2d, v0.2d, #2 + ushr v10.2d, v5.2d, #2 + ushr v17.2d, v4.2d, #2 + eor v9.16b, v9.16b, v3.16b + eor v8.16b, v8.16b, v2.16b + eor v10.16b, v10.16b, v7.16b + eor v17.16b, v17.16b, v6.16b + and v9.16b, v9.16b, v16.16b + and v8.16b, v8.16b, v16.16b + and v10.16b, v10.16b, v16.16b + and v16.16b, v17.16b, v16.16b + eor v3.16b, v3.16b, v9.16b + shl v9.2d, v9.2d, #2 + eor v2.16b, v2.16b, v8.16b + shl v8.2d, v8.2d, #2 + eor v7.16b, v7.16b, v10.16b + shl v10.2d, v10.2d, #2 + eor v6.16b, v6.16b, v16.16b + shl v16.2d, v16.2d, #2 + eor v1.16b, v1.16b, v9.16b + eor v0.16b, v0.16b, v8.16b + eor v5.16b, v5.16b, v10.16b + eor v4.16b, v4.16b, v16.16b + ushr v8.2d, v3.2d, #4 + ushr v9.2d, v2.2d, #4 + ushr v10.2d, v1.2d, #4 + ushr v16.2d, v0.2d, #4 + eor v8.16b, v8.16b, v7.16b + eor v9.16b, v9.16b, v6.16b + eor v10.16b, v10.16b, v5.16b + eor v16.16b, v16.16b, v4.16b + and v8.16b, v8.16b, v18.16b + and v9.16b, v9.16b, v18.16b + and v10.16b, v10.16b, v18.16b + and v16.16b, v16.16b, v18.16b + eor v7.16b, v7.16b, v8.16b + shl v8.2d, v8.2d, #4 + eor v6.16b, v6.16b, v9.16b + shl v9.2d, v9.2d, #4 + eor v5.16b, v5.16b, v10.16b + shl v10.2d, v10.2d, #4 + eor v4.16b, v4.16b, v16.16b + shl v16.2d, v16.2d, #4 + eor v3.16b, v3.16b, v8.16b + eor v2.16b, v2.16b, v9.16b + eor v1.16b, v1.16b, v10.16b + eor v0.16b, v0.16b, v16.16b + b .Lenc_sbox +.align 4 +.Lenc_loop: + ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x9], #64 + ldp q8, q9, [x9], #32 + eor v0.16b, v16.16b, v0.16b + ldr q10, [x9], #16 + eor v1.16b, v17.16b, v1.16b + ldr q16, [x9], #16 + eor v2.16b, v18.16b, v2.16b + eor v3.16b, v19.16b, v3.16b + eor v4.16b, v8.16b, v4.16b + eor v5.16b, v9.16b, v5.16b + eor v6.16b, v10.16b, v6.16b + eor v7.16b, v16.16b, v7.16b + tbl v0.16b, {v0.16b}, v28.16b + tbl v1.16b, {v1.16b}, 
v28.16b + tbl v2.16b, {v2.16b}, v28.16b + tbl v3.16b, {v3.16b}, v28.16b + tbl v4.16b, {v4.16b}, v28.16b + tbl v5.16b, {v5.16b}, v28.16b + tbl v6.16b, {v6.16b}, v28.16b + tbl v7.16b, {v7.16b}, v28.16b +.Lenc_sbox: + eor v5.16b, v5.16b, v6.16b + eor v3.16b, v3.16b, v0.16b + subs x10, x10, #1 + eor v2.16b, v2.16b, v1.16b + eor v5.16b, v5.16b, v0.16b + eor v8.16b, v3.16b, v7.16b + eor v6.16b, v6.16b, v2.16b + eor v7.16b, v7.16b, v5.16b + eor v8.16b, v8.16b, v4.16b + eor v3.16b, v6.16b, v3.16b + eor v4.16b, v4.16b, v5.16b + eor v6.16b, v1.16b, v5.16b + eor v2.16b, v2.16b, v7.16b + eor v1.16b, v8.16b, v1.16b + eor v8.16b, v7.16b, v4.16b + eor v9.16b, v3.16b, v0.16b + eor v10.16b, v7.16b, v6.16b + eor v16.16b, v5.16b, v3.16b + eor v17.16b, v6.16b, v2.16b + eor v18.16b, v5.16b, v1.16b + eor v19.16b, v2.16b, v4.16b + eor v20.16b, v1.16b, v0.16b + orr v21.16b, v8.16b, v9.16b + orr v22.16b, v10.16b, v16.16b + eor v23.16b, v8.16b, v17.16b + eor v24.16b, v9.16b, v18.16b + and v19.16b, v19.16b, v20.16b + orr v20.16b, v17.16b, v18.16b + and v8.16b, v8.16b, v9.16b + and v9.16b, v17.16b, v18.16b + and v17.16b, v23.16b, v24.16b + and v10.16b, v10.16b, v16.16b + eor v16.16b, v21.16b, v19.16b + eor v18.16b, v20.16b, v19.16b + and v19.16b, v2.16b, v1.16b + and v20.16b, v6.16b, v5.16b + eor v21.16b, v22.16b, v17.16b + eor v9.16b, v9.16b, v10.16b + eor v10.16b, v16.16b, v17.16b + eor v16.16b, v18.16b, v8.16b + and v17.16b, v4.16b, v0.16b + orr v18.16b, v7.16b, v3.16b + eor v21.16b, v21.16b, v8.16b + eor v8.16b, v9.16b, v8.16b + eor v9.16b, v10.16b, v19.16b + eor v10.16b, v3.16b, v0.16b + eor v16.16b, v16.16b, v17.16b + eor v17.16b, v5.16b, v1.16b + eor v19.16b, v21.16b, v20.16b + eor v20.16b, v8.16b, v18.16b + eor v8.16b, v8.16b, v18.16b + eor v18.16b, v7.16b, v4.16b + eor v21.16b, v9.16b, v16.16b + eor v22.16b, v6.16b, v2.16b + and v23.16b, v9.16b, v19.16b + eor v24.16b, v10.16b, v17.16b + eor v25.16b, v0.16b, v1.16b + eor v26.16b, v7.16b, v6.16b + eor v27.16b, v18.16b, v22.16b + eor v28.16b, v3.16b, v5.16b + eor v29.16b, v16.16b, v23.16b + eor v30.16b, v20.16b, v23.16b + eor v23.16b, v20.16b, v23.16b + eor v31.16b, v4.16b, v2.16b + bsl v29.16b, v19.16b, v20.16b + bsl v30.16b, v9.16b, v16.16b + bsl v8.16b, v29.16b, v23.16b + bsl v20.16b, v23.16b, v29.16b + eor v9.16b, v30.16b, v29.16b + and v5.16b, v5.16b, v30.16b + and v8.16b, v8.16b, v30.16b + and v1.16b, v1.16b, v29.16b + eor v16.16b, v19.16b, v20.16b + and v2.16b, v2.16b, v29.16b + eor v19.16b, v9.16b, v29.16b + and v17.16b, v17.16b, v9.16b + eor v8.16b, v8.16b, v21.16b + and v20.16b, v22.16b, v9.16b + eor v21.16b, v29.16b, v16.16b + eor v22.16b, v29.16b, v16.16b + and v23.16b, v25.16b, v16.16b + and v6.16b, v6.16b, v19.16b + eor v25.16b, v8.16b, v16.16b + eor v29.16b, v30.16b, v8.16b + and v4.16b, v21.16b, v4.16b + and v8.16b, v28.16b, v8.16b + and v0.16b, v22.16b, v0.16b + eor v21.16b, v23.16b, v1.16b + eor v22.16b, v9.16b, v25.16b + eor v9.16b, v9.16b, v25.16b + eor v23.16b, v25.16b, v16.16b + and v3.16b, v29.16b, v3.16b + and v24.16b, v24.16b, v25.16b + and v25.16b, v27.16b, v25.16b + and v10.16b, v22.16b, v10.16b + and v9.16b, v9.16b, v18.16b + eor v18.16b, v19.16b, v23.16b + and v19.16b, v26.16b, v23.16b + eor v3.16b, v5.16b, v3.16b + eor v17.16b, v17.16b, v24.16b + eor v10.16b, v24.16b, v10.16b + and v16.16b, v31.16b, v16.16b + eor v20.16b, v20.16b, v25.16b + eor v9.16b, v25.16b, v9.16b + eor v4.16b, v2.16b, v4.16b + and v7.16b, v18.16b, v7.16b + eor v18.16b, v19.16b, v6.16b + eor v5.16b, v8.16b, v5.16b + eor v0.16b, v1.16b, v0.16b + eor v1.16b, 
v21.16b, v10.16b + eor v8.16b, v3.16b, v17.16b + eor v2.16b, v16.16b, v2.16b + eor v3.16b, v6.16b, v7.16b + eor v6.16b, v18.16b, v9.16b + eor v4.16b, v4.16b, v20.16b + eor v10.16b, v5.16b, v10.16b + eor v0.16b, v0.16b, v17.16b + eor v9.16b, v2.16b, v9.16b + eor v3.16b, v3.16b, v20.16b + eor v7.16b, v6.16b, v1.16b + eor v5.16b, v8.16b, v4.16b + eor v6.16b, v10.16b, v1.16b + eor v2.16b, v4.16b, v0.16b + eor v4.16b, v3.16b, v10.16b + eor v9.16b, v9.16b, v7.16b + eor v3.16b, v0.16b, v5.16b + eor v0.16b, v1.16b, v4.16b + eor v1.16b, v4.16b, v8.16b + eor v4.16b, v9.16b, v5.16b + eor v6.16b, v6.16b, v3.16b + bcc .Lenc_done + ext v8.16b, v0.16b, v0.16b, #12 + ext v9.16b, v4.16b, v4.16b, #12 + ldr q28, [x11] + ext v10.16b, v6.16b, v6.16b, #12 + ext v16.16b, v1.16b, v1.16b, #12 + ext v17.16b, v3.16b, v3.16b, #12 + ext v18.16b, v7.16b, v7.16b, #12 + eor v0.16b, v0.16b, v8.16b + eor v4.16b, v4.16b, v9.16b + eor v6.16b, v6.16b, v10.16b + ext v19.16b, v2.16b, v2.16b, #12 + ext v20.16b, v5.16b, v5.16b, #12 + eor v1.16b, v1.16b, v16.16b + eor v3.16b, v3.16b, v17.16b + eor v7.16b, v7.16b, v18.16b + eor v2.16b, v2.16b, v19.16b + eor v16.16b, v16.16b, v0.16b + eor v5.16b, v5.16b, v20.16b + eor v17.16b, v17.16b, v6.16b + eor v10.16b, v10.16b, v4.16b + ext v0.16b, v0.16b, v0.16b, #8 + eor v9.16b, v9.16b, v1.16b + ext v1.16b, v1.16b, v1.16b, #8 + eor v8.16b, v8.16b, v5.16b + eor v16.16b, v16.16b, v5.16b + eor v18.16b, v18.16b, v3.16b + eor v19.16b, v19.16b, v7.16b + ext v3.16b, v3.16b, v3.16b, #8 + ext v7.16b, v7.16b, v7.16b, #8 + eor v20.16b, v20.16b, v2.16b + ext v6.16b, v6.16b, v6.16b, #8 + ext v21.16b, v5.16b, v5.16b, #8 + eor v17.16b, v17.16b, v5.16b + ext v2.16b, v2.16b, v2.16b, #8 + eor v10.16b, v10.16b, v5.16b + ext v22.16b, v4.16b, v4.16b, #8 + eor v0.16b, v0.16b, v8.16b + eor v1.16b, v1.16b, v16.16b + eor v5.16b, v7.16b, v18.16b + eor v4.16b, v3.16b, v17.16b + eor v3.16b, v6.16b, v10.16b + eor v7.16b, v21.16b, v20.16b + eor v6.16b, v2.16b, v19.16b + eor v2.16b, v22.16b, v9.16b + bne .Lenc_loop + ldr q28, [x11, #16]! 
// load from .LSRM0 on last round (x10 == 0) + b .Lenc_loop +.align 4 +.Lenc_done: + ushr v8.2d, v0.2d, #1 + movi v9.16b, #0x55 + ldr q10, [x9] + ushr v16.2d, v3.2d, #1 + movi v17.16b, #0x33 + ushr v18.2d, v4.2d, #1 + movi v19.16b, #0x0f + eor v8.16b, v8.16b, v1.16b + ushr v20.2d, v2.2d, #1 + eor v16.16b, v16.16b, v7.16b + eor v18.16b, v18.16b, v6.16b + and v8.16b, v8.16b, v9.16b + eor v20.16b, v20.16b, v5.16b + and v16.16b, v16.16b, v9.16b + and v18.16b, v18.16b, v9.16b + shl v21.2d, v8.2d, #1 + eor v1.16b, v1.16b, v8.16b + and v8.16b, v20.16b, v9.16b + eor v7.16b, v7.16b, v16.16b + shl v9.2d, v16.2d, #1 + eor v6.16b, v6.16b, v18.16b + shl v16.2d, v18.2d, #1 + eor v0.16b, v0.16b, v21.16b + shl v18.2d, v8.2d, #1 + eor v5.16b, v5.16b, v8.16b + eor v3.16b, v3.16b, v9.16b + eor v4.16b, v4.16b, v16.16b + ushr v8.2d, v1.2d, #2 + eor v2.16b, v2.16b, v18.16b + ushr v9.2d, v0.2d, #2 + ushr v16.2d, v7.2d, #2 + ushr v18.2d, v3.2d, #2 + eor v8.16b, v8.16b, v6.16b + eor v9.16b, v9.16b, v4.16b + eor v16.16b, v16.16b, v5.16b + eor v18.16b, v18.16b, v2.16b + and v8.16b, v8.16b, v17.16b + and v9.16b, v9.16b, v17.16b + and v16.16b, v16.16b, v17.16b + and v17.16b, v18.16b, v17.16b + eor v6.16b, v6.16b, v8.16b + shl v8.2d, v8.2d, #2 + eor v4.16b, v4.16b, v9.16b + shl v9.2d, v9.2d, #2 + eor v5.16b, v5.16b, v16.16b + shl v16.2d, v16.2d, #2 + eor v2.16b, v2.16b, v17.16b + shl v17.2d, v17.2d, #2 + eor v1.16b, v1.16b, v8.16b + eor v0.16b, v0.16b, v9.16b + eor v7.16b, v7.16b, v16.16b + eor v3.16b, v3.16b, v17.16b + ushr v8.2d, v6.2d, #4 + ushr v9.2d, v4.2d, #4 + ushr v16.2d, v1.2d, #4 + ushr v17.2d, v0.2d, #4 + eor v8.16b, v8.16b, v5.16b + eor v9.16b, v9.16b, v2.16b + eor v16.16b, v16.16b, v7.16b + eor v17.16b, v17.16b, v3.16b + and v8.16b, v8.16b, v19.16b + and v9.16b, v9.16b, v19.16b + and v16.16b, v16.16b, v19.16b + and v17.16b, v17.16b, v19.16b + eor v5.16b, v5.16b, v8.16b + shl v8.2d, v8.2d, #4 + eor v2.16b, v2.16b, v9.16b + shl v9.2d, v9.2d, #4 + eor v7.16b, v7.16b, v16.16b + shl v16.2d, v16.2d, #4 + eor v3.16b, v3.16b, v17.16b + shl v17.2d, v17.2d, #4 + eor v6.16b, v6.16b, v8.16b + eor v4.16b, v4.16b, v9.16b + eor v7.16b, v7.16b, v10.16b + eor v1.16b, v1.16b, v16.16b + eor v3.16b, v3.16b, v10.16b + eor v0.16b, v0.16b, v17.16b + eor v6.16b, v6.16b, v10.16b + eor v4.16b, v4.16b, v10.16b + eor v2.16b, v2.16b, v10.16b + eor v5.16b, v5.16b, v10.16b + eor v1.16b, v1.16b, v10.16b + eor v0.16b, v0.16b, v10.16b + ret +.size _bsaes_encrypt8,.-_bsaes_encrypt8 + +.type _bsaes_key_convert,%function +.align 4 +// On entry: +// x9 -> input key (big-endian) +// x10 = number of rounds +// x17 -> output key (native endianness) +// On exit: +// x9, x10 corrupted +// x11 -> .LM0_bigendian +// x17 -> last quadword of output key +// other general-purpose registers preserved +// v2-v6 preserved +// v7.16b[] = 0x63 +// v8-v14 preserved +// v15 = last round key (converted to native endianness) +// other SIMD registers corrupted +_bsaes_key_convert: +#ifdef __AARCH64EL__ + adr x11, .LM0_littleendian +#else + adr x11, .LM0_bigendian +#endif + ldr q0, [x9], #16 // load round 0 key + ldr q1, [x11] // .LM0 + ldr q15, [x9], #16 // load round 1 key + + movi v7.16b, #0x63 // compose .L63 + movi v16.16b, #0x01 // bit masks + movi v17.16b, #0x02 + movi v18.16b, #0x04 + movi v19.16b, #0x08 + movi v20.16b, #0x10 + movi v21.16b, #0x20 + movi v22.16b, #0x40 + movi v23.16b, #0x80 + +#ifdef __AARCH64EL__ + rev32 v0.16b, v0.16b +#endif + sub x10, x10, #1 + str q0, [x17], #16 // save round 0 key + +.align 4 +.Lkey_loop: + tbl v0.16b, {v15.16b}, 
v1.16b + ldr q15, [x9], #16 // load next round key + + eor v0.16b, v0.16b, v7.16b + cmtst v24.16b, v0.16b, v16.16b + cmtst v25.16b, v0.16b, v17.16b + cmtst v26.16b, v0.16b, v18.16b + cmtst v27.16b, v0.16b, v19.16b + cmtst v28.16b, v0.16b, v20.16b + cmtst v29.16b, v0.16b, v21.16b + cmtst v30.16b, v0.16b, v22.16b + cmtst v31.16b, v0.16b, v23.16b + sub x10, x10, #1 + st1 {v24.16b,v25.16b,v26.16b,v27.16b}, [x17], #64 // write bit-sliced round key + st1 {v28.16b,v29.16b,v30.16b,v31.16b}, [x17], #64 + cbnz x10, .Lkey_loop + + // don't save last round key +#ifdef __AARCH64EL__ + rev32 v15.16b, v15.16b + adr x11, .LM0_bigendian +#endif + ret +.size _bsaes_key_convert,.-_bsaes_key_convert + +.globl ossl_bsaes_cbc_encrypt +.type ossl_bsaes_cbc_encrypt,%function +.align 4 +// On entry: +// x0 -> input ciphertext +// x1 -> output plaintext +// x2 = size of ciphertext and plaintext in bytes (assumed a multiple of 16) +// x3 -> key +// x4 -> 128-bit initialisation vector (or preceding 128-bit block of ciphertext if continuing after an earlier call) +// w5 must be == 0 +// On exit: +// Output plaintext filled in +// Initialisation vector overwritten with last quadword of ciphertext +// No output registers, usual AAPCS64 register preservation +ossl_bsaes_cbc_encrypt: + AARCH64_VALID_CALL_TARGET + cmp x2, #128 + bhs .Lcbc_do_bsaes + b AES_cbc_encrypt +.Lcbc_do_bsaes: + + // it is up to the caller to make sure we are called with enc == 0 + + stp x29, x30, [sp, #-48]! + stp d8, d9, [sp, #16] + stp d10, d15, [sp, #32] + lsr x2, x2, #4 // len in 16 byte blocks + + ldr w15, [x3, #240] // get # of rounds + mov x14, sp + + // allocate the key schedule on the stack + add x17, sp, #96 + sub x17, x17, x15, lsl #7 // 128 bytes per inner round key, less 96 bytes + + // populate the key schedule + mov x9, x3 // pass key + mov x10, x15 // pass # of rounds + mov sp, x17 // sp is sp + bl _bsaes_key_convert + ldr q6, [sp] + str q15, [x17] // save last round key + eor v6.16b, v6.16b, v7.16b // fix up round 0 key (by XORing with 0x63) + str q6, [sp] + + ldr q15, [x4] // load IV + b .Lcbc_dec_loop + +.align 4 +.Lcbc_dec_loop: + subs x2, x2, #0x8 + bmi .Lcbc_dec_loop_finish + + ldr q0, [x0], #16 // load input + mov x9, sp // pass the key + ldr q1, [x0], #16 + mov x10, x15 + ldr q2, [x0], #16 + ldr q3, [x0], #16 + ldr q4, [x0], #16 + ldr q5, [x0], #16 + ldr q6, [x0], #16 + ldr q7, [x0], #-7*16 + + bl _bsaes_decrypt8 + + ldr q16, [x0], #16 // reload input + eor v0.16b, v0.16b, v15.16b // ^= IV + eor v1.16b, v1.16b, v16.16b + str q0, [x1], #16 // write output + ldr q0, [x0], #16 + str q1, [x1], #16 + ldr q1, [x0], #16 + eor v1.16b, v4.16b, v1.16b + ldr q4, [x0], #16 + eor v2.16b, v2.16b, v4.16b + eor v0.16b, v6.16b, v0.16b + ldr q4, [x0], #16 + str q0, [x1], #16 + str q1, [x1], #16 + eor v0.16b, v7.16b, v4.16b + ldr q1, [x0], #16 + str q2, [x1], #16 + ldr q2, [x0], #16 + ldr q15, [x0], #16 + str q0, [x1], #16 + eor v0.16b, v5.16b, v2.16b + eor v1.16b, v3.16b, v1.16b + str q1, [x1], #16 + str q0, [x1], #16 + + b .Lcbc_dec_loop + +.Lcbc_dec_loop_finish: + adds x2, x2, #8 + beq .Lcbc_dec_done + + ldr q0, [x0], #16 // load input + cmp x2, #2 + blo .Lcbc_dec_one + ldr q1, [x0], #16 + mov x9, sp // pass the key + mov x10, x15 + beq .Lcbc_dec_two + ldr q2, [x0], #16 + cmp x2, #4 + blo .Lcbc_dec_three + ldr q3, [x0], #16 + beq .Lcbc_dec_four + ldr q4, [x0], #16 + cmp x2, #6 + blo .Lcbc_dec_five + ldr q5, [x0], #16 + beq .Lcbc_dec_six + ldr q6, [x0], #-6*16 + + bl _bsaes_decrypt8 + + ldr q5, [x0], #16 // reload input + eor v0.16b, v0.16b, 
v15.16b // ^= IV + ldr q8, [x0], #16 + ldr q9, [x0], #16 + ldr q10, [x0], #16 + str q0, [x1], #16 // write output + ldr q0, [x0], #16 + eor v1.16b, v1.16b, v5.16b + ldr q5, [x0], #16 + eor v6.16b, v6.16b, v8.16b + ldr q15, [x0] + eor v4.16b, v4.16b, v9.16b + eor v2.16b, v2.16b, v10.16b + str q1, [x1], #16 + eor v0.16b, v7.16b, v0.16b + str q6, [x1], #16 + eor v1.16b, v3.16b, v5.16b + str q4, [x1], #16 + str q2, [x1], #16 + str q0, [x1], #16 + str q1, [x1] + b .Lcbc_dec_done +.align 4 +.Lcbc_dec_six: + sub x0, x0, #0x60 + bl _bsaes_decrypt8 + ldr q3, [x0], #16 // reload input + eor v0.16b, v0.16b, v15.16b // ^= IV + ldr q5, [x0], #16 + ldr q8, [x0], #16 + ldr q9, [x0], #16 + str q0, [x1], #16 // write output + ldr q0, [x0], #16 + eor v1.16b, v1.16b, v3.16b + ldr q15, [x0] + eor v3.16b, v6.16b, v5.16b + eor v4.16b, v4.16b, v8.16b + eor v2.16b, v2.16b, v9.16b + str q1, [x1], #16 + eor v0.16b, v7.16b, v0.16b + str q3, [x1], #16 + str q4, [x1], #16 + str q2, [x1], #16 + str q0, [x1] + b .Lcbc_dec_done +.align 4 +.Lcbc_dec_five: + sub x0, x0, #0x50 + bl _bsaes_decrypt8 + ldr q3, [x0], #16 // reload input + eor v0.16b, v0.16b, v15.16b // ^= IV + ldr q5, [x0], #16 + ldr q7, [x0], #16 + ldr q8, [x0], #16 + str q0, [x1], #16 // write output + ldr q15, [x0] + eor v0.16b, v1.16b, v3.16b + eor v1.16b, v6.16b, v5.16b + eor v3.16b, v4.16b, v7.16b + str q0, [x1], #16 + eor v0.16b, v2.16b, v8.16b + str q1, [x1], #16 + str q3, [x1], #16 + str q0, [x1] + b .Lcbc_dec_done +.align 4 +.Lcbc_dec_four: + sub x0, x0, #0x40 + bl _bsaes_decrypt8 + ldr q2, [x0], #16 // reload input + eor v0.16b, v0.16b, v15.16b // ^= IV + ldr q3, [x0], #16 + ldr q5, [x0], #16 + str q0, [x1], #16 // write output + ldr q15, [x0] + eor v0.16b, v1.16b, v2.16b + eor v1.16b, v6.16b, v3.16b + eor v2.16b, v4.16b, v5.16b + str q0, [x1], #16 + str q1, [x1], #16 + str q2, [x1] + b .Lcbc_dec_done +.align 4 +.Lcbc_dec_three: + sub x0, x0, #0x30 + bl _bsaes_decrypt8 + ldr q2, [x0], #16 // reload input + eor v0.16b, v0.16b, v15.16b // ^= IV + ldr q3, [x0], #16 + ldr q15, [x0] + str q0, [x1], #16 // write output + eor v0.16b, v1.16b, v2.16b + eor v1.16b, v6.16b, v3.16b + str q0, [x1], #16 + str q1, [x1] + b .Lcbc_dec_done +.align 4 +.Lcbc_dec_two: + sub x0, x0, #0x20 + bl _bsaes_decrypt8 + ldr q2, [x0], #16 // reload input + eor v0.16b, v0.16b, v15.16b // ^= IV + ldr q15, [x0] + str q0, [x1], #16 // write output + eor v0.16b, v1.16b, v2.16b + str q0, [x1] + b .Lcbc_dec_done +.align 4 +.Lcbc_dec_one: + sub x0, x0, #0x10 + stp x1, x4, [sp, #-32]! 
+ str x14, [sp, #16] + mov v8.16b, v15.16b + mov v15.16b, v0.16b + mov x2, x3 + bl AES_decrypt + ldr x14, [sp, #16] + ldp x1, x4, [sp], #32 + ldr q0, [x1] // load result + eor v0.16b, v0.16b, v8.16b // ^= IV + str q0, [x1] // write output + +.align 4 +.Lcbc_dec_done: + movi v0.16b, #0 + movi v1.16b, #0 +.Lcbc_dec_bzero: // wipe key schedule [if any] + stp q0, q1, [sp], #32 + cmp sp, x14 + bne .Lcbc_dec_bzero + str q15, [x4] // return IV + ldp d8, d9, [sp, #16] + ldp d10, d15, [sp, #32] + ldp x29, x30, [sp], #48 + ret +.size ossl_bsaes_cbc_encrypt,.-ossl_bsaes_cbc_encrypt + +.globl ossl_bsaes_ctr32_encrypt_blocks +.type ossl_bsaes_ctr32_encrypt_blocks,%function +.align 4 +// On entry: +// x0 -> input text (whole 16-byte blocks) +// x1 -> output text (whole 16-byte blocks) +// x2 = number of 16-byte blocks to encrypt/decrypt (> 0) +// x3 -> key +// x4 -> initial value of 128-bit counter (stored big-endian) which increments, modulo 2^32, for each block +// On exit: +// Output text filled in +// No output registers, usual AAPCS64 register preservation +ossl_bsaes_ctr32_encrypt_blocks: + AARCH64_VALID_CALL_TARGET + cmp x2, #8 // use plain AES for + blo .Lctr_enc_short // small sizes + + stp x29, x30, [sp, #-80]! + stp d8, d9, [sp, #16] + stp d10, d11, [sp, #32] + stp d12, d13, [sp, #48] + stp d14, d15, [sp, #64] + + ldr w15, [x3, #240] // get # of rounds + mov x14, sp + + // allocate the key schedule on the stack + add x17, sp, #96 + sub x17, x17, x15, lsl #7 // 128 bytes per inner round key, less 96 bytes + + // populate the key schedule + mov x9, x3 // pass key + mov x10, x15 // pass # of rounds + mov sp, x17 // sp is sp + bl _bsaes_key_convert + eor v7.16b, v7.16b, v15.16b // fix up last round key + str q7, [x17] // save last round key + + ldr q0, [x4] // load counter + add x13, x11, #.LREVM0SR-.LM0_bigendian + ldr q4, [sp] // load round0 key + + movi v8.4s, #1 // compose 1<<96 + movi v9.16b, #0 + rev32 v15.16b, v0.16b + rev32 v0.16b, v0.16b + ext v11.16b, v9.16b, v8.16b, #4 + rev32 v4.16b, v4.16b + add v12.4s, v11.4s, v11.4s // compose 2<<96 + str q4, [sp] // save adjusted round0 key + add v13.4s, v11.4s, v12.4s // compose 3<<96 + add v14.4s, v12.4s, v12.4s // compose 4<<96 + b .Lctr_enc_loop + +.align 4 +.Lctr_enc_loop: + // Intermix prologue from _bsaes_encrypt8 to use the opportunity + // to flip byte order in 32-bit counter + + add v1.4s, v15.4s, v11.4s // +1 + add x9, sp, #0x10 // pass next round key + add v2.4s, v15.4s, v12.4s // +2 + ldr q9, [x13] // .LREVM0SR + ldr q8, [sp] // load round0 key + add v3.4s, v15.4s, v13.4s // +3 + mov x10, x15 // pass rounds + sub x11, x13, #.LREVM0SR-.LSR // pass constants + add v6.4s, v2.4s, v14.4s + add v4.4s, v15.4s, v14.4s // +4 + add v7.4s, v3.4s, v14.4s + add v15.4s, v4.4s, v14.4s // next counter + add v5.4s, v1.4s, v14.4s + + bl _bsaes_encrypt8_alt + + subs x2, x2, #8 + blo .Lctr_enc_loop_done + + ldr q16, [x0], #16 + ldr q17, [x0], #16 + eor v1.16b, v1.16b, v17.16b + ldr q17, [x0], #16 + eor v0.16b, v0.16b, v16.16b + eor v4.16b, v4.16b, v17.16b + str q0, [x1], #16 + ldr q16, [x0], #16 + str q1, [x1], #16 + mov v0.16b, v15.16b + str q4, [x1], #16 + ldr q1, [x0], #16 + eor v4.16b, v6.16b, v16.16b + eor v1.16b, v3.16b, v1.16b + ldr q3, [x0], #16 + eor v3.16b, v7.16b, v3.16b + ldr q6, [x0], #16 + eor v2.16b, v2.16b, v6.16b + ldr q6, [x0], #16 + eor v5.16b, v5.16b, v6.16b + str q4, [x1], #16 + str q1, [x1], #16 + str q3, [x1], #16 + str q2, [x1], #16 + str q5, [x1], #16 + + bne .Lctr_enc_loop + b .Lctr_enc_done + +.align 4 +.Lctr_enc_loop_done: + 
add x2, x2, #8 + ldr q16, [x0], #16 // load input + eor v0.16b, v0.16b, v16.16b + str q0, [x1], #16 // write output + cmp x2, #2 + blo .Lctr_enc_done + ldr q17, [x0], #16 + eor v1.16b, v1.16b, v17.16b + str q1, [x1], #16 + beq .Lctr_enc_done + ldr q18, [x0], #16 + eor v4.16b, v4.16b, v18.16b + str q4, [x1], #16 + cmp x2, #4 + blo .Lctr_enc_done + ldr q19, [x0], #16 + eor v6.16b, v6.16b, v19.16b + str q6, [x1], #16 + beq .Lctr_enc_done + ldr q20, [x0], #16 + eor v3.16b, v3.16b, v20.16b + str q3, [x1], #16 + cmp x2, #6 + blo .Lctr_enc_done + ldr q21, [x0], #16 + eor v7.16b, v7.16b, v21.16b + str q7, [x1], #16 + beq .Lctr_enc_done + ldr q22, [x0] + eor v2.16b, v2.16b, v22.16b + str q2, [x1], #16 + +.Lctr_enc_done: + movi v0.16b, #0 + movi v1.16b, #0 +.Lctr_enc_bzero: // wipe key schedule [if any] + stp q0, q1, [sp], #32 + cmp sp, x14 + bne .Lctr_enc_bzero + + ldp d8, d9, [sp, #16] + ldp d10, d11, [sp, #32] + ldp d12, d13, [sp, #48] + ldp d14, d15, [sp, #64] + ldp x29, x30, [sp], #80 + ret + +.Lctr_enc_short: + stp x29, x30, [sp, #-96]! + stp x19, x20, [sp, #16] + stp x21, x22, [sp, #32] + str x23, [sp, #48] + + mov x19, x0 // copy arguments + mov x20, x1 + mov x21, x2 + mov x22, x3 + ldr w23, [x4, #12] // load counter .LSW + ldr q1, [x4] // load whole counter value +#ifdef __AARCH64EL__ + rev w23, w23 +#endif + str q1, [sp, #80] // copy counter value + +.Lctr_enc_short_loop: + add x0, sp, #80 // input counter value + add x1, sp, #64 // output on the stack + mov x2, x22 // key + + bl AES_encrypt + + ldr q0, [x19], #16 // load input + ldr q1, [sp, #64] // load encrypted counter + add x23, x23, #1 +#ifdef __AARCH64EL__ + rev w0, w23 + str w0, [sp, #80+12] // next counter value +#else + str w23, [sp, #80+12] // next counter value +#endif + eor v0.16b, v0.16b, v1.16b + str q0, [x20], #16 // store output + subs x21, x21, #1 + bne .Lctr_enc_short_loop + + movi v0.16b, #0 + movi v1.16b, #0 + stp q0, q1, [sp, #64] + + ldr x23, [sp, #48] + ldp x21, x22, [sp, #32] + ldp x19, x20, [sp, #16] + ldp x29, x30, [sp], #96 + ret +.size ossl_bsaes_ctr32_encrypt_blocks,.-ossl_bsaes_ctr32_encrypt_blocks + +.globl ossl_bsaes_xts_encrypt +.type ossl_bsaes_xts_encrypt,%function +.align 4 +// On entry: +// x0 -> input plaintext +// x1 -> output ciphertext +// x2 -> length of text in bytes (must be at least 16) +// x3 -> key1 (used to encrypt the XORed plaintext blocks) +// x4 -> key2 (used to encrypt the initial vector to yield the initial tweak) +// x5 -> 16-byte initial vector (typically, sector number) +// On exit: +// Output ciphertext filled in +// No output registers, usual AAPCS64 register preservation +ossl_bsaes_xts_encrypt: + AARCH64_VALID_CALL_TARGET + // Stack layout: + // sp -> + // nrounds*128-96 bytes: key schedule + // x19 -> + // 16 bytes: frame record + // 4*16 bytes: tweak storage across _bsaes_encrypt8 + // 6*8 bytes: storage for 5 callee-saved general-purpose registers + // 8*8 bytes: storage for 8 callee-saved SIMD registers + stp x29, x30, [sp, #-192]! 
+ stp x19, x20, [sp, #80] + stp x21, x22, [sp, #96] + str x23, [sp, #112] + stp d8, d9, [sp, #128] + stp d10, d11, [sp, #144] + stp d12, d13, [sp, #160] + stp d14, d15, [sp, #176] + + mov x19, sp + mov x20, x0 + mov x21, x1 + mov x22, x2 + mov x23, x3 + + // generate initial tweak + sub sp, sp, #16 + mov x0, x5 // iv[] + mov x1, sp + mov x2, x4 // key2 + bl AES_encrypt + ldr q11, [sp], #16 + + ldr w1, [x23, #240] // get # of rounds + // allocate the key schedule on the stack + add x17, sp, #96 + sub x17, x17, x1, lsl #7 // 128 bytes per inner round key, less 96 bytes + + // populate the key schedule + mov x9, x23 // pass key + mov x10, x1 // pass # of rounds + mov sp, x17 + bl _bsaes_key_convert + eor v15.16b, v15.16b, v7.16b // fix up last round key + str q15, [x17] // save last round key + + subs x22, x22, #0x80 + blo .Lxts_enc_short + b .Lxts_enc_loop + +.align 4 +.Lxts_enc_loop: + ldr q8, .Lxts_magic + mov x10, x1 // pass rounds + add x2, x19, #16 + ldr q0, [x20], #16 + sshr v1.2d, v11.2d, #63 + mov x9, sp // pass key schedule + ldr q6, .Lxts_magic+16 + add v2.2d, v11.2d, v11.2d + cmtst v3.2d, v11.2d, v6.2d + and v1.16b, v1.16b, v8.16b + ext v1.16b, v1.16b, v1.16b, #8 + and v3.16b, v3.16b, v8.16b + ldr q4, [x20], #16 + eor v12.16b, v2.16b, v1.16b + eor v1.16b, v4.16b, v12.16b + eor v0.16b, v0.16b, v11.16b + cmtst v2.2d, v12.2d, v6.2d + add v4.2d, v12.2d, v12.2d + add x0, x19, #16 + ext v3.16b, v3.16b, v3.16b, #8 + and v2.16b, v2.16b, v8.16b + eor v13.16b, v4.16b, v3.16b + ldr q3, [x20], #16 + ext v4.16b, v2.16b, v2.16b, #8 + eor v2.16b, v3.16b, v13.16b + ldr q3, [x20], #16 + add v5.2d, v13.2d, v13.2d + cmtst v7.2d, v13.2d, v6.2d + and v7.16b, v7.16b, v8.16b + ldr q9, [x20], #16 + ext v7.16b, v7.16b, v7.16b, #8 + ldr q10, [x20], #16 + eor v14.16b, v5.16b, v4.16b + ldr q16, [x20], #16 + add v4.2d, v14.2d, v14.2d + eor v3.16b, v3.16b, v14.16b + eor v15.16b, v4.16b, v7.16b + add v5.2d, v15.2d, v15.2d + ldr q7, [x20], #16 + cmtst v4.2d, v14.2d, v6.2d + and v17.16b, v4.16b, v8.16b + cmtst v18.2d, v15.2d, v6.2d + eor v4.16b, v9.16b, v15.16b + ext v9.16b, v17.16b, v17.16b, #8 + eor v9.16b, v5.16b, v9.16b + add v17.2d, v9.2d, v9.2d + and v18.16b, v18.16b, v8.16b + eor v5.16b, v10.16b, v9.16b + str q9, [x2], #16 + ext v10.16b, v18.16b, v18.16b, #8 + cmtst v9.2d, v9.2d, v6.2d + and v9.16b, v9.16b, v8.16b + eor v10.16b, v17.16b, v10.16b + cmtst v17.2d, v10.2d, v6.2d + eor v6.16b, v16.16b, v10.16b + str q10, [x2], #16 + ext v9.16b, v9.16b, v9.16b, #8 + add v10.2d, v10.2d, v10.2d + eor v9.16b, v10.16b, v9.16b + str q9, [x2], #16 + eor v7.16b, v7.16b, v9.16b + add v9.2d, v9.2d, v9.2d + and v8.16b, v17.16b, v8.16b + ext v8.16b, v8.16b, v8.16b, #8 + eor v8.16b, v9.16b, v8.16b + str q8, [x2] // next round tweak + + bl _bsaes_encrypt8 + + ldr q8, [x0], #16 + eor v0.16b, v0.16b, v11.16b + eor v1.16b, v1.16b, v12.16b + ldr q9, [x0], #16 + eor v4.16b, v4.16b, v13.16b + eor v6.16b, v6.16b, v14.16b + ldr q10, [x0], #16 + eor v3.16b, v3.16b, v15.16b + subs x22, x22, #0x80 + str q0, [x21], #16 + ldr q11, [x0] // next round tweak + str q1, [x21], #16 + eor v0.16b, v7.16b, v8.16b + eor v1.16b, v2.16b, v9.16b + str q4, [x21], #16 + eor v2.16b, v5.16b, v10.16b + str q6, [x21], #16 + str q3, [x21], #16 + str q0, [x21], #16 + str q1, [x21], #16 + str q2, [x21], #16 + bpl .Lxts_enc_loop + +.Lxts_enc_short: + adds x22, x22, #0x70 + bmi .Lxts_enc_done + + ldr q8, .Lxts_magic + sshr v1.2d, v11.2d, #63 + add v2.2d, v11.2d, v11.2d + ldr q9, .Lxts_magic+16 + subs x22, x22, #0x10 + ldr q0, [x20], #16 + and v1.16b, v1.16b, 
v8.16b + cmtst v3.2d, v11.2d, v9.2d + ext v1.16b, v1.16b, v1.16b, #8 + and v3.16b, v3.16b, v8.16b + eor v12.16b, v2.16b, v1.16b + ext v1.16b, v3.16b, v3.16b, #8 + add v2.2d, v12.2d, v12.2d + cmtst v3.2d, v12.2d, v9.2d + eor v13.16b, v2.16b, v1.16b + and v22.16b, v3.16b, v8.16b + bmi .Lxts_enc_1 + + ext v2.16b, v22.16b, v22.16b, #8 + add v3.2d, v13.2d, v13.2d + ldr q1, [x20], #16 + cmtst v4.2d, v13.2d, v9.2d + subs x22, x22, #0x10 + eor v14.16b, v3.16b, v2.16b + and v23.16b, v4.16b, v8.16b + bmi .Lxts_enc_2 + + ext v3.16b, v23.16b, v23.16b, #8 + add v4.2d, v14.2d, v14.2d + ldr q2, [x20], #16 + cmtst v5.2d, v14.2d, v9.2d + eor v0.16b, v0.16b, v11.16b + subs x22, x22, #0x10 + eor v15.16b, v4.16b, v3.16b + and v24.16b, v5.16b, v8.16b + bmi .Lxts_enc_3 + + ext v4.16b, v24.16b, v24.16b, #8 + add v5.2d, v15.2d, v15.2d + ldr q3, [x20], #16 + cmtst v6.2d, v15.2d, v9.2d + eor v1.16b, v1.16b, v12.16b + subs x22, x22, #0x10 + eor v16.16b, v5.16b, v4.16b + and v25.16b, v6.16b, v8.16b + bmi .Lxts_enc_4 + + ext v5.16b, v25.16b, v25.16b, #8 + add v6.2d, v16.2d, v16.2d + add x0, x19, #16 + cmtst v7.2d, v16.2d, v9.2d + ldr q4, [x20], #16 + eor v2.16b, v2.16b, v13.16b + str q16, [x0], #16 + subs x22, x22, #0x10 + eor v17.16b, v6.16b, v5.16b + and v26.16b, v7.16b, v8.16b + bmi .Lxts_enc_5 + + ext v7.16b, v26.16b, v26.16b, #8 + add v18.2d, v17.2d, v17.2d + ldr q5, [x20], #16 + eor v3.16b, v3.16b, v14.16b + str q17, [x0], #16 + subs x22, x22, #0x10 + eor v18.16b, v18.16b, v7.16b + bmi .Lxts_enc_6 + + ldr q6, [x20], #16 + eor v4.16b, v4.16b, v15.16b + eor v5.16b, v5.16b, v16.16b + str q18, [x0] // next round tweak + mov x9, sp // pass key schedule + mov x10, x1 + add x0, x19, #16 + sub x22, x22, #0x10 + eor v6.16b, v6.16b, v17.16b + + bl _bsaes_encrypt8 + + ldr q16, [x0], #16 + eor v0.16b, v0.16b, v11.16b + eor v1.16b, v1.16b, v12.16b + ldr q17, [x0], #16 + eor v4.16b, v4.16b, v13.16b + eor v6.16b, v6.16b, v14.16b + eor v3.16b, v3.16b, v15.16b + ldr q11, [x0] // next round tweak + str q0, [x21], #16 + str q1, [x21], #16 + eor v0.16b, v7.16b, v16.16b + eor v1.16b, v2.16b, v17.16b + str q4, [x21], #16 + str q6, [x21], #16 + str q3, [x21], #16 + str q0, [x21], #16 + str q1, [x21], #16 + b .Lxts_enc_done + +.align 4 +.Lxts_enc_6: + eor v4.16b, v4.16b, v15.16b + eor v5.16b, v5.16b, v16.16b + mov x9, sp // pass key schedule + mov x10, x1 // pass rounds + add x0, x19, #16 + + bl _bsaes_encrypt8 + + ldr q16, [x0], #16 + eor v0.16b, v0.16b, v11.16b + eor v1.16b, v1.16b, v12.16b + eor v4.16b, v4.16b, v13.16b + eor v6.16b, v6.16b, v14.16b + ldr q11, [x0] // next round tweak + eor v3.16b, v3.16b, v15.16b + str q0, [x21], #16 + str q1, [x21], #16 + eor v0.16b, v7.16b, v16.16b + str q4, [x21], #16 + str q6, [x21], #16 + str q3, [x21], #16 + str q0, [x21], #16 + b .Lxts_enc_done + +.align 4 +.Lxts_enc_5: + eor v3.16b, v3.16b, v14.16b + eor v4.16b, v4.16b, v15.16b + mov x9, sp // pass key schedule + mov x10, x1 // pass rounds + add x0, x19, #16 + + bl _bsaes_encrypt8 + + eor v0.16b, v0.16b, v11.16b + eor v1.16b, v1.16b, v12.16b + ldr q11, [x0] // next round tweak + eor v4.16b, v4.16b, v13.16b + eor v6.16b, v6.16b, v14.16b + eor v3.16b, v3.16b, v15.16b + str q0, [x21], #16 + str q1, [x21], #16 + str q4, [x21], #16 + str q6, [x21], #16 + str q3, [x21], #16 + b .Lxts_enc_done + +.align 4 +.Lxts_enc_4: + eor v2.16b, v2.16b, v13.16b + eor v3.16b, v3.16b, v14.16b + mov x9, sp // pass key schedule + mov x10, x1 // pass rounds + add x0, x19, #16 + + bl _bsaes_encrypt8 + + eor v0.16b, v0.16b, v11.16b + eor v1.16b, v1.16b, v12.16b + eor 
v4.16b, v4.16b, v13.16b + eor v6.16b, v6.16b, v14.16b + mov v11.16b, v15.16b // next round tweak + str q0, [x21], #16 + str q1, [x21], #16 + str q4, [x21], #16 + str q6, [x21], #16 + b .Lxts_enc_done + +.align 4 +.Lxts_enc_3: + eor v1.16b, v1.16b, v12.16b + eor v2.16b, v2.16b, v13.16b + mov x9, sp // pass key schedule + mov x10, x1 // pass rounds + add x0, x19, #16 + + bl _bsaes_encrypt8 + + eor v0.16b, v0.16b, v11.16b + eor v1.16b, v1.16b, v12.16b + eor v4.16b, v4.16b, v13.16b + mov v11.16b, v14.16b // next round tweak + str q0, [x21], #16 + str q1, [x21], #16 + str q4, [x21], #16 + b .Lxts_enc_done + +.align 4 +.Lxts_enc_2: + eor v0.16b, v0.16b, v11.16b + eor v1.16b, v1.16b, v12.16b + mov x9, sp // pass key schedule + mov x10, x1 // pass rounds + add x0, x19, #16 + + bl _bsaes_encrypt8 + + eor v0.16b, v0.16b, v11.16b + eor v1.16b, v1.16b, v12.16b + mov v11.16b, v13.16b // next round tweak + str q0, [x21], #16 + str q1, [x21], #16 + b .Lxts_enc_done + +.align 4 +.Lxts_enc_1: + eor v0.16b, v0.16b, v11.16b + sub x0, sp, #16 + sub x1, sp, #16 + mov x2, x23 + mov v13.d[0], v11.d[1] // just in case AES_encrypt corrupts top half of callee-saved SIMD registers + mov v14.d[0], v12.d[1] + str q0, [sp, #-16]! + + bl AES_encrypt + + ldr q0, [sp], #16 + trn1 v13.2d, v11.2d, v13.2d + trn1 v11.2d, v12.2d, v14.2d // next round tweak + eor v0.16b, v0.16b, v13.16b + str q0, [x21], #16 + +.Lxts_enc_done: + adds x22, x22, #0x10 + beq .Lxts_enc_ret + + sub x6, x21, #0x10 + // Penultimate plaintext block produces final ciphertext part-block + // plus remaining part of final plaintext block. Move ciphertext part + // to final position and reuse penultimate ciphertext block buffer to + // construct final plaintext block +.Lxts_enc_steal: + ldrb w0, [x20], #1 + ldrb w1, [x21, #-0x10] + strb w0, [x21, #-0x10] + strb w1, [x21], #1 + + subs x22, x22, #1 + bhi .Lxts_enc_steal + + // Finally encrypt the penultimate ciphertext block using the + // last tweak + ldr q0, [x6] + eor v0.16b, v0.16b, v11.16b + str q0, [sp, #-16]! 
+ mov x0, sp + mov x1, sp + mov x2, x23 + mov x21, x6 + mov v13.d[0], v11.d[1] // just in case AES_encrypt corrupts top half of callee-saved SIMD registers + + bl AES_encrypt + + trn1 v11.2d, v11.2d, v13.2d + ldr q0, [sp], #16 + eor v0.16b, v0.16b, v11.16b + str q0, [x21] + +.Lxts_enc_ret: + + movi v0.16b, #0 + movi v1.16b, #0 +.Lxts_enc_bzero: // wipe key schedule + stp q0, q1, [sp], #32 + cmp sp, x19 + bne .Lxts_enc_bzero + + ldp x19, x20, [sp, #80] + ldp x21, x22, [sp, #96] + ldr x23, [sp, #112] + ldp d8, d9, [sp, #128] + ldp d10, d11, [sp, #144] + ldp d12, d13, [sp, #160] + ldp d14, d15, [sp, #176] + ldp x29, x30, [sp], #192 + ret +.size ossl_bsaes_xts_encrypt,.-ossl_bsaes_xts_encrypt + +// The assembler doesn't seem capable of de-duplicating these when expressed +// using `ldr qd,=` syntax, so assign a symbolic address +.align 5 +.Lxts_magic: +.quad 1, 0x87, 0x4000000000000000, 0x4000000000000000 + +.globl ossl_bsaes_xts_decrypt +.type ossl_bsaes_xts_decrypt,%function +.align 4 +// On entry: +// x0 -> input ciphertext +// x1 -> output plaintext +// x2 -> length of text in bytes (must be at least 16) +// x3 -> key1 (used to decrypt the XORed ciphertext blocks) +// x4 -> key2 (used to encrypt the initial vector to yield the initial tweak) +// x5 -> 16-byte initial vector (typically, sector number) +// On exit: +// Output plaintext filled in +// No output registers, usual AAPCS64 register preservation +ossl_bsaes_xts_decrypt: + AARCH64_VALID_CALL_TARGET + // Stack layout: + // sp -> + // nrounds*128-96 bytes: key schedule + // x19 -> + // 16 bytes: frame record + // 4*16 bytes: tweak storage across _bsaes_decrypt8 + // 6*8 bytes: storage for 5 callee-saved general-purpose registers + // 8*8 bytes: storage for 8 callee-saved SIMD registers + stp x29, x30, [sp, #-192]! 
+ stp x19, x20, [sp, #80] + stp x21, x22, [sp, #96] + str x23, [sp, #112] + stp d8, d9, [sp, #128] + stp d10, d11, [sp, #144] + stp d12, d13, [sp, #160] + stp d14, d15, [sp, #176] + + mov x19, sp + mov x20, x0 + mov x21, x1 + mov x22, x2 + mov x23, x3 + + // generate initial tweak + sub sp, sp, #16 + mov x0, x5 // iv[] + mov x1, sp + mov x2, x4 // key2 + bl AES_encrypt + ldr q11, [sp], #16 + + ldr w1, [x23, #240] // get # of rounds + // allocate the key schedule on the stack + add x17, sp, #96 + sub x17, x17, x1, lsl #7 // 128 bytes per inner round key, less 96 bytes + + // populate the key schedule + mov x9, x23 // pass key + mov x10, x1 // pass # of rounds + mov sp, x17 + bl _bsaes_key_convert + ldr q6, [sp] + str q15, [x17] // save last round key + eor v6.16b, v6.16b, v7.16b // fix up round 0 key (by XORing with 0x63) + str q6, [sp] + + sub x30, x22, #0x10 + tst x22, #0xf // if not multiple of 16 + csel x22, x30, x22, ne // subtract another 16 bytes + subs x22, x22, #0x80 + + blo .Lxts_dec_short + b .Lxts_dec_loop + +.align 4 +.Lxts_dec_loop: + ldr q8, .Lxts_magic + mov x10, x1 // pass rounds + add x2, x19, #16 + ldr q0, [x20], #16 + sshr v1.2d, v11.2d, #63 + mov x9, sp // pass key schedule + ldr q6, .Lxts_magic+16 + add v2.2d, v11.2d, v11.2d + cmtst v3.2d, v11.2d, v6.2d + and v1.16b, v1.16b, v8.16b + ext v1.16b, v1.16b, v1.16b, #8 + and v3.16b, v3.16b, v8.16b + ldr q4, [x20], #16 + eor v12.16b, v2.16b, v1.16b + eor v1.16b, v4.16b, v12.16b + eor v0.16b, v0.16b, v11.16b + cmtst v2.2d, v12.2d, v6.2d + add v4.2d, v12.2d, v12.2d + add x0, x19, #16 + ext v3.16b, v3.16b, v3.16b, #8 + and v2.16b, v2.16b, v8.16b + eor v13.16b, v4.16b, v3.16b + ldr q3, [x20], #16 + ext v4.16b, v2.16b, v2.16b, #8 + eor v2.16b, v3.16b, v13.16b + ldr q3, [x20], #16 + add v5.2d, v13.2d, v13.2d + cmtst v7.2d, v13.2d, v6.2d + and v7.16b, v7.16b, v8.16b + ldr q9, [x20], #16 + ext v7.16b, v7.16b, v7.16b, #8 + ldr q10, [x20], #16 + eor v14.16b, v5.16b, v4.16b + ldr q16, [x20], #16 + add v4.2d, v14.2d, v14.2d + eor v3.16b, v3.16b, v14.16b + eor v15.16b, v4.16b, v7.16b + add v5.2d, v15.2d, v15.2d + ldr q7, [x20], #16 + cmtst v4.2d, v14.2d, v6.2d + and v17.16b, v4.16b, v8.16b + cmtst v18.2d, v15.2d, v6.2d + eor v4.16b, v9.16b, v15.16b + ext v9.16b, v17.16b, v17.16b, #8 + eor v9.16b, v5.16b, v9.16b + add v17.2d, v9.2d, v9.2d + and v18.16b, v18.16b, v8.16b + eor v5.16b, v10.16b, v9.16b + str q9, [x2], #16 + ext v10.16b, v18.16b, v18.16b, #8 + cmtst v9.2d, v9.2d, v6.2d + and v9.16b, v9.16b, v8.16b + eor v10.16b, v17.16b, v10.16b + cmtst v17.2d, v10.2d, v6.2d + eor v6.16b, v16.16b, v10.16b + str q10, [x2], #16 + ext v9.16b, v9.16b, v9.16b, #8 + add v10.2d, v10.2d, v10.2d + eor v9.16b, v10.16b, v9.16b + str q9, [x2], #16 + eor v7.16b, v7.16b, v9.16b + add v9.2d, v9.2d, v9.2d + and v8.16b, v17.16b, v8.16b + ext v8.16b, v8.16b, v8.16b, #8 + eor v8.16b, v9.16b, v8.16b + str q8, [x2] // next round tweak + + bl _bsaes_decrypt8 + + eor v6.16b, v6.16b, v13.16b + eor v0.16b, v0.16b, v11.16b + ldr q8, [x0], #16 + eor v7.16b, v7.16b, v8.16b + str q0, [x21], #16 + eor v0.16b, v1.16b, v12.16b + ldr q1, [x0], #16 + eor v1.16b, v3.16b, v1.16b + subs x22, x22, #0x80 + eor v2.16b, v2.16b, v15.16b + eor v3.16b, v4.16b, v14.16b + ldr q4, [x0], #16 + str q0, [x21], #16 + ldr q11, [x0] // next round tweak + eor v0.16b, v5.16b, v4.16b + str q6, [x21], #16 + str q3, [x21], #16 + str q2, [x21], #16 + str q7, [x21], #16 + str q1, [x21], #16 + str q0, [x21], #16 + bpl .Lxts_dec_loop + +.Lxts_dec_short: + adds x22, x22, #0x70 + bmi .Lxts_dec_done + + ldr 
q8, .Lxts_magic + sshr v1.2d, v11.2d, #63 + add v2.2d, v11.2d, v11.2d + ldr q9, .Lxts_magic+16 + subs x22, x22, #0x10 + ldr q0, [x20], #16 + and v1.16b, v1.16b, v8.16b + cmtst v3.2d, v11.2d, v9.2d + ext v1.16b, v1.16b, v1.16b, #8 + and v3.16b, v3.16b, v8.16b + eor v12.16b, v2.16b, v1.16b + ext v1.16b, v3.16b, v3.16b, #8 + add v2.2d, v12.2d, v12.2d + cmtst v3.2d, v12.2d, v9.2d + eor v13.16b, v2.16b, v1.16b + and v22.16b, v3.16b, v8.16b + bmi .Lxts_dec_1 + + ext v2.16b, v22.16b, v22.16b, #8 + add v3.2d, v13.2d, v13.2d + ldr q1, [x20], #16 + cmtst v4.2d, v13.2d, v9.2d + subs x22, x22, #0x10 + eor v14.16b, v3.16b, v2.16b + and v23.16b, v4.16b, v8.16b + bmi .Lxts_dec_2 + + ext v3.16b, v23.16b, v23.16b, #8 + add v4.2d, v14.2d, v14.2d + ldr q2, [x20], #16 + cmtst v5.2d, v14.2d, v9.2d + eor v0.16b, v0.16b, v11.16b + subs x22, x22, #0x10 + eor v15.16b, v4.16b, v3.16b + and v24.16b, v5.16b, v8.16b + bmi .Lxts_dec_3 + + ext v4.16b, v24.16b, v24.16b, #8 + add v5.2d, v15.2d, v15.2d + ldr q3, [x20], #16 + cmtst v6.2d, v15.2d, v9.2d + eor v1.16b, v1.16b, v12.16b + subs x22, x22, #0x10 + eor v16.16b, v5.16b, v4.16b + and v25.16b, v6.16b, v8.16b + bmi .Lxts_dec_4 + + ext v5.16b, v25.16b, v25.16b, #8 + add v6.2d, v16.2d, v16.2d + add x0, x19, #16 + cmtst v7.2d, v16.2d, v9.2d + ldr q4, [x20], #16 + eor v2.16b, v2.16b, v13.16b + str q16, [x0], #16 + subs x22, x22, #0x10 + eor v17.16b, v6.16b, v5.16b + and v26.16b, v7.16b, v8.16b + bmi .Lxts_dec_5 + + ext v7.16b, v26.16b, v26.16b, #8 + add v18.2d, v17.2d, v17.2d + ldr q5, [x20], #16 + eor v3.16b, v3.16b, v14.16b + str q17, [x0], #16 + subs x22, x22, #0x10 + eor v18.16b, v18.16b, v7.16b + bmi .Lxts_dec_6 + + ldr q6, [x20], #16 + eor v4.16b, v4.16b, v15.16b + eor v5.16b, v5.16b, v16.16b + str q18, [x0] // next round tweak + mov x9, sp // pass key schedule + mov x10, x1 + add x0, x19, #16 + sub x22, x22, #0x10 + eor v6.16b, v6.16b, v17.16b + + bl _bsaes_decrypt8 + + ldr q16, [x0], #16 + eor v0.16b, v0.16b, v11.16b + eor v1.16b, v1.16b, v12.16b + ldr q17, [x0], #16 + eor v6.16b, v6.16b, v13.16b + eor v4.16b, v4.16b, v14.16b + eor v2.16b, v2.16b, v15.16b + ldr q11, [x0] // next round tweak + str q0, [x21], #16 + str q1, [x21], #16 + eor v0.16b, v7.16b, v16.16b + eor v1.16b, v3.16b, v17.16b + str q6, [x21], #16 + str q4, [x21], #16 + str q2, [x21], #16 + str q0, [x21], #16 + str q1, [x21], #16 + b .Lxts_dec_done + +.align 4 +.Lxts_dec_6: + eor v4.16b, v4.16b, v15.16b + eor v5.16b, v5.16b, v16.16b + mov x9, sp // pass key schedule + mov x10, x1 // pass rounds + add x0, x19, #16 + + bl _bsaes_decrypt8 + + ldr q16, [x0], #16 + eor v0.16b, v0.16b, v11.16b + eor v1.16b, v1.16b, v12.16b + eor v6.16b, v6.16b, v13.16b + eor v4.16b, v4.16b, v14.16b + ldr q11, [x0] // next round tweak + eor v2.16b, v2.16b, v15.16b + str q0, [x21], #16 + str q1, [x21], #16 + eor v0.16b, v7.16b, v16.16b + str q6, [x21], #16 + str q4, [x21], #16 + str q2, [x21], #16 + str q0, [x21], #16 + b .Lxts_dec_done + +.align 4 +.Lxts_dec_5: + eor v3.16b, v3.16b, v14.16b + eor v4.16b, v4.16b, v15.16b + mov x9, sp // pass key schedule + mov x10, x1 // pass rounds + add x0, x19, #16 + + bl _bsaes_decrypt8 + + eor v0.16b, v0.16b, v11.16b + eor v1.16b, v1.16b, v12.16b + ldr q11, [x0] // next round tweak + eor v6.16b, v6.16b, v13.16b + eor v4.16b, v4.16b, v14.16b + eor v2.16b, v2.16b, v15.16b + str q0, [x21], #16 + str q1, [x21], #16 + str q6, [x21], #16 + str q4, [x21], #16 + str q2, [x21], #16 + b .Lxts_dec_done + +.align 4 +.Lxts_dec_4: + eor v2.16b, v2.16b, v13.16b + eor v3.16b, v3.16b, v14.16b + mov x9, sp 
// pass key schedule + mov x10, x1 // pass rounds + add x0, x19, #16 + + bl _bsaes_decrypt8 + + eor v0.16b, v0.16b, v11.16b + eor v1.16b, v1.16b, v12.16b + eor v6.16b, v6.16b, v13.16b + eor v4.16b, v4.16b, v14.16b + mov v11.16b, v15.16b // next round tweak + str q0, [x21], #16 + str q1, [x21], #16 + str q6, [x21], #16 + str q4, [x21], #16 + b .Lxts_dec_done + +.align 4 +.Lxts_dec_3: + eor v1.16b, v1.16b, v12.16b + eor v2.16b, v2.16b, v13.16b + mov x9, sp // pass key schedule + mov x10, x1 // pass rounds + add x0, x19, #16 + + bl _bsaes_decrypt8 + + eor v0.16b, v0.16b, v11.16b + eor v1.16b, v1.16b, v12.16b + eor v6.16b, v6.16b, v13.16b + mov v11.16b, v14.16b // next round tweak + str q0, [x21], #16 + str q1, [x21], #16 + str q6, [x21], #16 + b .Lxts_dec_done + +.align 4 +.Lxts_dec_2: + eor v0.16b, v0.16b, v11.16b + eor v1.16b, v1.16b, v12.16b + mov x9, sp // pass key schedule + mov x10, x1 // pass rounds + add x0, x19, #16 + + bl _bsaes_decrypt8 + + eor v0.16b, v0.16b, v11.16b + eor v1.16b, v1.16b, v12.16b + mov v11.16b, v13.16b // next round tweak + str q0, [x21], #16 + str q1, [x21], #16 + b .Lxts_dec_done + +.align 4 +.Lxts_dec_1: + eor v0.16b, v0.16b, v11.16b + sub x0, sp, #16 + sub x1, sp, #16 + mov x2, x23 + mov v13.d[0], v11.d[1] // just in case AES_decrypt corrupts top half of callee-saved SIMD registers + mov v14.d[0], v12.d[1] + str q0, [sp, #-16]! + + bl AES_decrypt + + ldr q0, [sp], #16 + trn1 v13.2d, v11.2d, v13.2d + trn1 v11.2d, v12.2d, v14.2d // next round tweak + eor v0.16b, v0.16b, v13.16b + str q0, [x21], #16 + +.Lxts_dec_done: + adds x22, x22, #0x10 + beq .Lxts_dec_ret + + // calculate one round of extra tweak for the stolen ciphertext + ldr q8, .Lxts_magic + sshr v6.2d, v11.2d, #63 + and v6.16b, v6.16b, v8.16b + add v12.2d, v11.2d, v11.2d + ext v6.16b, v6.16b, v6.16b, #8 + eor v12.16b, v12.16b, v6.16b + + // perform the final decryption with the last tweak value + ldr q0, [x20], #16 + eor v0.16b, v0.16b, v12.16b + str q0, [sp, #-16]! + mov x0, sp + mov x1, sp + mov x2, x23 + mov v13.d[0], v11.d[1] // just in case AES_decrypt corrupts top half of callee-saved SIMD registers + mov v14.d[0], v12.d[1] + + bl AES_decrypt + + trn1 v12.2d, v12.2d, v14.2d + trn1 v11.2d, v11.2d, v13.2d + ldr q0, [sp], #16 + eor v0.16b, v0.16b, v12.16b + str q0, [x21] + + mov x6, x21 + // Penultimate ciphertext block produces final plaintext part-block + // plus remaining part of final ciphertext block. Move plaintext part + // to final position and reuse penultimate plaintext block buffer to + // construct final ciphertext block +.Lxts_dec_steal: + ldrb w1, [x21] + ldrb w0, [x20], #1 + strb w1, [x21, #0x10] + strb w0, [x21], #1 + + subs x22, x22, #1 + bhi .Lxts_dec_steal + + // Finally decrypt the penultimate plaintext block using the + // penultimate tweak + ldr q0, [x6] + eor v0.16b, v0.16b, v11.16b + str q0, [sp, #-16]! 
+ mov x0, sp + mov x1, sp + mov x2, x23 + mov x21, x6 + + bl AES_decrypt + + trn1 v11.2d, v11.2d, v13.2d + ldr q0, [sp], #16 + eor v0.16b, v0.16b, v11.16b + str q0, [x21] + +.Lxts_dec_ret: + + movi v0.16b, #0 + movi v1.16b, #0 +.Lxts_dec_bzero: // wipe key schedule + stp q0, q1, [sp], #32 + cmp sp, x19 + bne .Lxts_dec_bzero + + ldp x19, x20, [sp, #80] + ldp x21, x22, [sp, #96] + ldr x23, [sp, #112] + ldp d8, d9, [sp, #128] + ldp d10, d11, [sp, #144] + ldp d12, d13, [sp, #160] + ldp d14, d15, [sp, #176] + ldp x29, x30, [sp], #192 + ret +.size ossl_bsaes_xts_decrypt,.-ossl_bsaes_xts_decrypt diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/AARCH64-GCC/crypto/aes/vpaes-armv8.S b/CryptoPkg/Library/OpensslLib/OpensslGen/AARCH64-GCC/crypto/aes/vpaes-armv8.S index 229dad24d350..9aef5acd861b 100644 --- a/CryptoPkg/Library/OpensslLib/OpensslGen/AARCH64-GCC/crypto/aes/vpaes-armv8.S +++ b/CryptoPkg/Library/OpensslLib/OpensslGen/AARCH64-GCC/crypto/aes/vpaes-armv8.S @@ -1,3 +1,5 @@ +#include "arm_arch.h" + .text .type _vpaes_consts,%object @@ -195,7 +197,7 @@ _vpaes_encrypt_core: .type vpaes_encrypt,%function .align 4 vpaes_encrypt: -.inst 0xd503233f // paciasp + AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-16]! add x29,sp,#0 @@ -205,7 +207,7 @@ vpaes_encrypt: st1 {v0.16b}, [x1] ldp x29,x30,[sp],#16 -.inst 0xd50323bf // autiasp + AARCH64_VALIDATE_LINK_REGISTER ret .size vpaes_encrypt,.-vpaes_encrypt @@ -428,7 +430,7 @@ _vpaes_decrypt_core: .type vpaes_decrypt,%function .align 4 vpaes_decrypt: -.inst 0xd503233f // paciasp + AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-16]! add x29,sp,#0 @@ -438,7 +440,7 @@ vpaes_decrypt: st1 {v0.16b}, [x1] ldp x29,x30,[sp],#16 -.inst 0xd50323bf // autiasp + AARCH64_VALIDATE_LINK_REGISTER ret .size vpaes_decrypt,.-vpaes_decrypt @@ -602,7 +604,7 @@ _vpaes_key_preheat: .type _vpaes_schedule_core,%function .align 4 _vpaes_schedule_core: -.inst 0xd503233f // paciasp + AARCH64_SIGN_LINK_REGISTER stp x29, x30, [sp,#-16]! add x29,sp,#0 @@ -767,7 +769,7 @@ _vpaes_schedule_core: eor v6.16b, v6.16b, v6.16b // vpxor %xmm6, %xmm6, %xmm6 eor v7.16b, v7.16b, v7.16b // vpxor %xmm7, %xmm7, %xmm7 ldp x29, x30, [sp],#16 -.inst 0xd50323bf // autiasp + AARCH64_VALIDATE_LINK_REGISTER ret .size _vpaes_schedule_core,.-_vpaes_schedule_core @@ -980,7 +982,7 @@ _vpaes_schedule_mangle: .type vpaes_set_encrypt_key,%function .align 4 vpaes_set_encrypt_key: -.inst 0xd503233f // paciasp + AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-16]! add x29,sp,#0 stp d8,d9,[sp,#-16]! // ABI spec says so @@ -996,7 +998,7 @@ vpaes_set_encrypt_key: ldp d8,d9,[sp],#16 ldp x29,x30,[sp],#16 -.inst 0xd50323bf // autiasp + AARCH64_VALIDATE_LINK_REGISTER ret .size vpaes_set_encrypt_key,.-vpaes_set_encrypt_key @@ -1004,7 +1006,7 @@ vpaes_set_encrypt_key: .type vpaes_set_decrypt_key,%function .align 4 vpaes_set_decrypt_key: -.inst 0xd503233f // paciasp + AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-16]! add x29,sp,#0 stp d8,d9,[sp,#-16]! // ABI spec says so @@ -1024,18 +1026,18 @@ vpaes_set_decrypt_key: ldp d8,d9,[sp],#16 ldp x29,x30,[sp],#16 -.inst 0xd50323bf // autiasp + AARCH64_VALIDATE_LINK_REGISTER ret .size vpaes_set_decrypt_key,.-vpaes_set_decrypt_key .globl vpaes_cbc_encrypt .type vpaes_cbc_encrypt,%function .align 4 vpaes_cbc_encrypt: + AARCH64_SIGN_LINK_REGISTER cbz x2, .Lcbc_abort cmp w5, #0 // check direction b.eq vpaes_cbc_decrypt -.inst 0xd503233f // paciasp stp x29,x30,[sp,#-16]! 
add x29,sp,#0 @@ -1058,15 +1060,16 @@ vpaes_cbc_encrypt: st1 {v0.16b}, [x4] // write ivec ldp x29,x30,[sp],#16 -.inst 0xd50323bf // autiasp .Lcbc_abort: + AARCH64_VALIDATE_LINK_REGISTER ret .size vpaes_cbc_encrypt,.-vpaes_cbc_encrypt .type vpaes_cbc_decrypt,%function .align 4 vpaes_cbc_decrypt: -.inst 0xd503233f // paciasp + // Not adding AARCH64_SIGN_LINK_REGISTER here because vpaes_cbc_decrypt is jumped to + // only from vpaes_cbc_encrypt which has already signed the return address. stp x29,x30,[sp,#-16]! add x29,sp,#0 stp d8,d9,[sp,#-16]! // ABI spec says so @@ -1108,14 +1111,14 @@ vpaes_cbc_decrypt: ldp d10,d11,[sp],#16 ldp d8,d9,[sp],#16 ldp x29,x30,[sp],#16 -.inst 0xd50323bf // autiasp + AARCH64_VALIDATE_LINK_REGISTER ret .size vpaes_cbc_decrypt,.-vpaes_cbc_decrypt .globl vpaes_ecb_encrypt .type vpaes_ecb_encrypt,%function .align 4 vpaes_ecb_encrypt: -.inst 0xd503233f // paciasp + AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-16]! add x29,sp,#0 stp d8,d9,[sp,#-16]! // ABI spec says so @@ -1149,7 +1152,7 @@ vpaes_ecb_encrypt: ldp d10,d11,[sp],#16 ldp d8,d9,[sp],#16 ldp x29,x30,[sp],#16 -.inst 0xd50323bf // autiasp + AARCH64_VALIDATE_LINK_REGISTER ret .size vpaes_ecb_encrypt,.-vpaes_ecb_encrypt @@ -1157,7 +1160,7 @@ vpaes_ecb_encrypt: .type vpaes_ecb_decrypt,%function .align 4 vpaes_ecb_decrypt: -.inst 0xd503233f // paciasp + AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-16]! add x29,sp,#0 stp d8,d9,[sp,#-16]! // ABI spec says so @@ -1191,6 +1194,6 @@ vpaes_ecb_decrypt: ldp d10,d11,[sp],#16 ldp d8,d9,[sp],#16 ldp x29,x30,[sp],#16 -.inst 0xd50323bf // autiasp + AARCH64_VALIDATE_LINK_REGISTER ret .size vpaes_ecb_decrypt,.-vpaes_ecb_decrypt diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/AARCH64-GCC/crypto/arm64cpuid.S b/CryptoPkg/Library/OpensslLib/OpensslGen/AARCH64-GCC/crypto/arm64cpuid.S index 297d5075f95d..2ca3b8d86709 100644 --- a/CryptoPkg/Library/OpensslLib/OpensslGen/AARCH64-GCC/crypto/arm64cpuid.S +++ b/CryptoPkg/Library/OpensslLib/OpensslGen/AARCH64-GCC/crypto/arm64cpuid.S @@ -7,6 +7,7 @@ .globl _armv7_neon_probe .type _armv7_neon_probe,%function _armv7_neon_probe: + AARCH64_VALID_CALL_TARGET orr v15.16b, v15.16b, v15.16b ret .size _armv7_neon_probe,.-_armv7_neon_probe @@ -14,6 +15,7 @@ _armv7_neon_probe: .globl _armv7_tick .type _armv7_tick,%function _armv7_tick: + AARCH64_VALID_CALL_TARGET #ifdef __APPLE__ mrs x0, CNTPCT_EL0 #else @@ -25,6 +27,7 @@ _armv7_tick: .globl _armv8_aes_probe .type _armv8_aes_probe,%function _armv8_aes_probe: + AARCH64_VALID_CALL_TARGET aese v0.16b, v0.16b ret .size _armv8_aes_probe,.-_armv8_aes_probe @@ -32,6 +35,7 @@ _armv8_aes_probe: .globl _armv8_sha1_probe .type _armv8_sha1_probe,%function _armv8_sha1_probe: + AARCH64_VALID_CALL_TARGET sha1h s0, s0 ret .size _armv8_sha1_probe,.-_armv8_sha1_probe @@ -39,6 +43,7 @@ _armv8_sha1_probe: .globl _armv8_sha256_probe .type _armv8_sha256_probe,%function _armv8_sha256_probe: + AARCH64_VALID_CALL_TARGET sha256su0 v0.4s, v0.4s ret .size _armv8_sha256_probe,.-_armv8_sha256_probe @@ -46,28 +51,72 @@ _armv8_sha256_probe: .globl _armv8_pmull_probe .type _armv8_pmull_probe,%function _armv8_pmull_probe: + AARCH64_VALID_CALL_TARGET pmull v0.1q, v0.1d, v0.1d ret .size _armv8_pmull_probe,.-_armv8_pmull_probe +.globl _armv8_sm4_probe +.type _armv8_sm4_probe,%function +_armv8_sm4_probe: + AARCH64_VALID_CALL_TARGET +.inst 0xcec08400 // sm4e v0.4s, v0.4s + ret +.size _armv8_sm4_probe,.-_armv8_sm4_probe + .globl _armv8_sha512_probe .type _armv8_sha512_probe,%function _armv8_sha512_probe: -.long 0xcec08000 // sha512su0 
v0.2d,v0.2d + AARCH64_VALID_CALL_TARGET +.inst 0xcec08000 // sha512su0 v0.2d,v0.2d ret .size _armv8_sha512_probe,.-_armv8_sha512_probe +.globl _armv8_eor3_probe +.type _armv8_eor3_probe,%function +_armv8_eor3_probe: + AARCH64_VALID_CALL_TARGET +.inst 0xce010800 // eor3 v0.16b, v0.16b, v1.16b, v2.16b + ret +.size _armv8_eor3_probe,.-_armv8_eor3_probe + +.globl _armv8_sve_probe +.type _armv8_sve_probe,%function +_armv8_sve_probe: + AARCH64_VALID_CALL_TARGET +.inst 0x04a03000 // eor z0.d,z0.d,z0.d + ret +.size _armv8_sve_probe,.-_armv8_sve_probe + +.globl _armv8_sve2_probe +.type _armv8_sve2_probe,%function +_armv8_sve2_probe: + AARCH64_VALID_CALL_TARGET +.inst 0x04e03400 // xar z0.d,z0.d,z0.d + ret +.size _armv8_sve2_probe,.-_armv8_sve2_probe + .globl _armv8_cpuid_probe .type _armv8_cpuid_probe,%function _armv8_cpuid_probe: + AARCH64_VALID_CALL_TARGET mrs x0, midr_el1 ret .size _armv8_cpuid_probe,.-_armv8_cpuid_probe +.globl _armv8_sm3_probe +.type _armv8_sm3_probe,%function +_armv8_sm3_probe: + AARCH64_VALID_CALL_TARGET +.inst 0xce63c004 // sm3partw1 v4.4s, v0.4s, v3.4s + ret +.size _armv8_sm3_probe,.-_armv8_sm3_probe + .globl OPENSSL_cleanse .type OPENSSL_cleanse,%function .align 5 OPENSSL_cleanse: + AARCH64_VALID_CALL_TARGET cbz x1,.Lret // len==0? cmp x1,#15 b.hi .Lot // len>15 @@ -99,6 +148,7 @@ OPENSSL_cleanse: .type CRYPTO_memcmp,%function .align 4 CRYPTO_memcmp: + AARCH64_VALID_CALL_TARGET eor w3,w3,w3 cbz x2,.Lno_data // len==0? cmp x2,#16 @@ -127,3 +177,98 @@ CRYPTO_memcmp: lsr w0,w0,#31 ret .size CRYPTO_memcmp,.-CRYPTO_memcmp + +.globl _armv8_rng_probe +.type _armv8_rng_probe,%function +_armv8_rng_probe: + AARCH64_VALID_CALL_TARGET + mrs x0, s3_3_c2_c4_0 // rndr + mrs x0, s3_3_c2_c4_1 // rndrrs + ret +.size _armv8_rng_probe,.-_armv8_rng_probe +// Fill buffer with Randomly Generated Bytes +// inputs: char * in x0 - Pointer to buffer +// size_t in x1 - Number of bytes to write to buffer +// outputs: size_t in x0 - Number of bytes successfully written to buffer +.globl OPENSSL_rndr_asm +.type OPENSSL_rndr_asm,%function +.align 4 +OPENSSL_rndr_asm: + AARCH64_VALID_CALL_TARGET + mov x2,xzr + mov x3,xzr + +.align 4 +.Loop_rndr: + cmp x1,#0 + b.eq .rndr_done + mov x3,xzr + mrs x3,s3_3_c2_c4_0 + b.eq .rndr_done + + cmp x1,#8 + b.lt .Loop_single_byte_rndr + + str x3,[x0] + add x0,x0,#8 + add x2,x2,#8 + subs x1,x1,#8 + b.ge .Loop_rndr + +.align 4 +.Loop_single_byte_rndr: + strb w3,[x0] + lsr x3,x3,#8 + add x2,x2,#1 + add x0,x0,#1 + subs x1,x1,#1 + b.gt .Loop_single_byte_rndr + +.align 4 +.rndr_done: + mov x0,x2 + ret +.size OPENSSL_rndr_asm,.-OPENSSL_rndr_asm +// Fill buffer with Randomly Generated Bytes +// inputs: char * in x0 - Pointer to buffer +// size_t in x1 - Number of bytes to write to buffer +// outputs: size_t in x0 - Number of bytes successfully written to buffer +.globl OPENSSL_rndrrs_asm +.type OPENSSL_rndrrs_asm,%function +.align 4 +OPENSSL_rndrrs_asm: + AARCH64_VALID_CALL_TARGET + mov x2,xzr + mov x3,xzr + +.align 4 +.Loop_rndrrs: + cmp x1,#0 + b.eq .rndrrs_done + mov x3,xzr + mrs x3,s3_3_c2_c4_1 + b.eq .rndrrs_done + + cmp x1,#8 + b.lt .Loop_single_byte_rndrrs + + str x3,[x0] + add x0,x0,#8 + add x2,x2,#8 + subs x1,x1,#8 + b.ge .Loop_rndrrs + +.align 4 +.Loop_single_byte_rndrrs: + strb w3,[x0] + lsr x3,x3,#8 + add x2,x2,#1 + add x0,x0,#1 + subs x1,x1,#1 + b.gt .Loop_single_byte_rndrrs + +.align 4 +.rndrrs_done: + mov x0,x2 + ret +.size OPENSSL_rndrrs_asm,.-OPENSSL_rndrrs_asm diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/AARCH64-GCC/crypto/bn/armv8-mont.S 
b/CryptoPkg/Library/OpensslLib/OpensslGen/AARCH64-GCC/crypto/bn/armv8-mont.S index 7448af982c99..ceb58c50c9e3 100644 --- a/CryptoPkg/Library/OpensslLib/OpensslGen/AARCH64-GCC/crypto/bn/armv8-mont.S +++ b/CryptoPkg/Library/OpensslLib/OpensslGen/AARCH64-GCC/crypto/bn/armv8-mont.S @@ -1,5 +1,5 @@ +#include "arm_arch.h" #ifndef __KERNEL__ -# include "arm_arch.h" .hidden OPENSSL_armv8_rsa_neonized #endif @@ -9,6 +9,7 @@ .type bn_mul_mont,%function .align 5 bn_mul_mont: + AARCH64_SIGN_LINK_REGISTER .Lbn_mul_mont: tst x5,#3 b.ne .Lmul_mont @@ -219,11 +220,14 @@ bn_mul_mont: mov x0,#1 ldp x23,x24,[x29,#48] ldr x29,[sp],#64 + AARCH64_VALIDATE_LINK_REGISTER ret .size bn_mul_mont,.-bn_mul_mont .type bn_mul8x_mont_neon,%function .align 5 bn_mul8x_mont_neon: + // Not adding AARCH64_SIGN_LINK_REGISTER here because bn_mul8x_mont_neon is jumped to + // only from bn_mul_mont which has already signed the return address. stp x29,x30,[sp,#-80]! mov x16,sp stp d8,d9,[sp,#16] @@ -916,6 +920,7 @@ bn_mul8x_mont_neon: ldp d10,d11,[sp,#32] ldp d8,d9,[sp,#16] ldr x29,[sp],#80 + AARCH64_VALIDATE_LINK_REGISTER ret // bx lr .size bn_mul8x_mont_neon,.-bn_mul8x_mont_neon @@ -925,7 +930,8 @@ __bn_sqr8x_mont: cmp x1,x2 b.ne __bn_mul4x_mont .Lsqr8x_mont: -.inst 0xd503233f // paciasp + // Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_sqr8x_mont is jumped to + // only from bn_mul_mont which has already signed the return address. stp x29,x30,[sp,#-128]! add x29,sp,#0 stp x19,x20,[sp,#16] @@ -1676,13 +1682,15 @@ __bn_sqr8x_mont: ldp x25,x26,[x29,#64] ldp x27,x28,[x29,#80] ldr x29,[sp],#128 -.inst 0xd50323bf // autiasp + // x30 is loaded earlier + AARCH64_VALIDATE_LINK_REGISTER ret .size __bn_sqr8x_mont,.-__bn_sqr8x_mont .type __bn_mul4x_mont,%function .align 5 __bn_mul4x_mont: -.inst 0xd503233f // paciasp + // Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_mul4x_mont is jumped to + // only from bn_mul_mont (or __bn_sqr8x_mont from bn_mul_mont) which has already signed the return address. stp x29,x30,[sp,#-128]! add x29,sp,#0 stp x19,x20,[sp,#16] @@ -2116,7 +2124,8 @@ __bn_mul4x_mont: ldp x25,x26,[x29,#64] ldp x27,x28,[x29,#80] ldr x29,[sp],#128 -.inst 0xd50323bf // autiasp + // x30 loaded earlier + AARCH64_VALIDATE_LINK_REGISTER ret .size __bn_mul4x_mont,.-__bn_mul4x_mont .byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/AARCH64-GCC/crypto/ec/ecp_nistz256-armv8.S b/CryptoPkg/Library/OpensslLib/OpensslGen/AARCH64-GCC/crypto/ec/ecp_nistz256-armv8.S index efd46af5e434..6fe86a40201c 100644 --- a/CryptoPkg/Library/OpensslLib/OpensslGen/AARCH64-GCC/crypto/ec/ecp_nistz256-armv8.S +++ b/CryptoPkg/Library/OpensslLib/OpensslGen/AARCH64-GCC/crypto/ec/ecp_nistz256-armv8.S @@ -2395,7 +2395,7 @@ ecp_nistz256_precomputed: .type ecp_nistz256_to_mont,%function .align 6 ecp_nistz256_to_mont: -.inst 0xd503233f // paciasp + AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-32]! 
add x29,sp,#0 stp x19,x20,[sp,#16] @@ -2411,7 +2411,7 @@ ecp_nistz256_to_mont: ldp x19,x20,[sp,#16] ldp x29,x30,[sp],#32 -.inst 0xd50323bf // autiasp + AARCH64_VALIDATE_LINK_REGISTER ret .size ecp_nistz256_to_mont,.-ecp_nistz256_to_mont @@ -2420,7 +2420,7 @@ ecp_nistz256_to_mont: .type ecp_nistz256_from_mont,%function .align 4 ecp_nistz256_from_mont: -.inst 0xd503233f // paciasp + AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-32]! add x29,sp,#0 stp x19,x20,[sp,#16] @@ -2436,7 +2436,7 @@ ecp_nistz256_from_mont: ldp x19,x20,[sp,#16] ldp x29,x30,[sp],#32 -.inst 0xd50323bf // autiasp + AARCH64_VALIDATE_LINK_REGISTER ret .size ecp_nistz256_from_mont,.-ecp_nistz256_from_mont @@ -2446,7 +2446,7 @@ ecp_nistz256_from_mont: .type ecp_nistz256_mul_mont,%function .align 4 ecp_nistz256_mul_mont: -.inst 0xd503233f // paciasp + AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-32]! add x29,sp,#0 stp x19,x20,[sp,#16] @@ -2461,7 +2461,7 @@ ecp_nistz256_mul_mont: ldp x19,x20,[sp,#16] ldp x29,x30,[sp],#32 -.inst 0xd50323bf // autiasp + AARCH64_VALIDATE_LINK_REGISTER ret .size ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont @@ -2470,7 +2470,7 @@ ecp_nistz256_mul_mont: .type ecp_nistz256_sqr_mont,%function .align 4 ecp_nistz256_sqr_mont: -.inst 0xd503233f // paciasp + AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-32]! add x29,sp,#0 stp x19,x20,[sp,#16] @@ -2484,7 +2484,7 @@ ecp_nistz256_sqr_mont: ldp x19,x20,[sp,#16] ldp x29,x30,[sp],#32 -.inst 0xd50323bf // autiasp + AARCH64_VALIDATE_LINK_REGISTER ret .size ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont @@ -2494,7 +2494,7 @@ ecp_nistz256_sqr_mont: .type ecp_nistz256_add,%function .align 4 ecp_nistz256_add: -.inst 0xd503233f // paciasp + AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-16]! add x29,sp,#0 @@ -2508,7 +2508,7 @@ ecp_nistz256_add: bl __ecp_nistz256_add ldp x29,x30,[sp],#16 -.inst 0xd50323bf // autiasp + AARCH64_VALIDATE_LINK_REGISTER ret .size ecp_nistz256_add,.-ecp_nistz256_add @@ -2517,7 +2517,7 @@ ecp_nistz256_add: .type ecp_nistz256_div_by_2,%function .align 4 ecp_nistz256_div_by_2: -.inst 0xd503233f // paciasp + AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-16]! add x29,sp,#0 @@ -2529,7 +2529,7 @@ ecp_nistz256_div_by_2: bl __ecp_nistz256_div_by_2 ldp x29,x30,[sp],#16 -.inst 0xd50323bf // autiasp + AARCH64_VALIDATE_LINK_REGISTER ret .size ecp_nistz256_div_by_2,.-ecp_nistz256_div_by_2 @@ -2538,7 +2538,7 @@ ecp_nistz256_div_by_2: .type ecp_nistz256_mul_by_2,%function .align 4 ecp_nistz256_mul_by_2: -.inst 0xd503233f // paciasp + AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-16]! add x29,sp,#0 @@ -2554,7 +2554,7 @@ ecp_nistz256_mul_by_2: bl __ecp_nistz256_add // ret = a+a // 2*a ldp x29,x30,[sp],#16 -.inst 0xd50323bf // autiasp + AARCH64_VALIDATE_LINK_REGISTER ret .size ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2 @@ -2563,7 +2563,7 @@ ecp_nistz256_mul_by_2: .type ecp_nistz256_mul_by_3,%function .align 4 ecp_nistz256_mul_by_3: -.inst 0xd503233f // paciasp + AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-16]! add x29,sp,#0 @@ -2590,7 +2590,7 @@ ecp_nistz256_mul_by_3: bl __ecp_nistz256_add // ret += a // 2*a+a=3*a ldp x29,x30,[sp],#16 -.inst 0xd50323bf // autiasp + AARCH64_VALIDATE_LINK_REGISTER ret .size ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3 @@ -2600,7 +2600,7 @@ ecp_nistz256_mul_by_3: .type ecp_nistz256_sub,%function .align 4 ecp_nistz256_sub: -.inst 0xd503233f // paciasp + AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-16]! 
add x29,sp,#0 @@ -2612,7 +2612,7 @@ ecp_nistz256_sub: bl __ecp_nistz256_sub_from ldp x29,x30,[sp],#16 -.inst 0xd50323bf // autiasp + AARCH64_VALIDATE_LINK_REGISTER ret .size ecp_nistz256_sub,.-ecp_nistz256_sub @@ -2621,7 +2621,7 @@ ecp_nistz256_sub: .type ecp_nistz256_neg,%function .align 4 ecp_nistz256_neg: -.inst 0xd503233f // paciasp + AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-16]! add x29,sp,#0 @@ -2636,7 +2636,7 @@ ecp_nistz256_neg: bl __ecp_nistz256_sub_from ldp x29,x30,[sp],#16 -.inst 0xd50323bf // autiasp + AARCH64_VALIDATE_LINK_REGISTER ret .size ecp_nistz256_neg,.-ecp_nistz256_neg @@ -3014,7 +3014,7 @@ __ecp_nistz256_div_by_2: .type ecp_nistz256_point_double,%function .align 5 ecp_nistz256_point_double: -.inst 0xd503233f // paciasp + AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-96]! add x29,sp,#0 stp x19,x20,[sp,#16] @@ -3149,14 +3149,14 @@ ecp_nistz256_point_double: ldp x19,x20,[x29,#16] ldp x21,x22,[x29,#32] ldp x29,x30,[sp],#96 -.inst 0xd50323bf // autiasp + AARCH64_VALIDATE_LINK_REGISTER ret .size ecp_nistz256_point_double,.-ecp_nistz256_point_double .globl ecp_nistz256_point_add .type ecp_nistz256_point_add,%function .align 5 ecp_nistz256_point_add: -.inst 0xd503233f // paciasp + AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-96]! add x29,sp,#0 stp x19,x20,[sp,#16] @@ -3403,14 +3403,14 @@ ecp_nistz256_point_add: ldp x25,x26,[x29,#64] ldp x27,x28,[x29,#80] ldp x29,x30,[sp],#96 -.inst 0xd50323bf // autiasp + AARCH64_VALIDATE_LINK_REGISTER ret .size ecp_nistz256_point_add,.-ecp_nistz256_point_add .globl ecp_nistz256_point_add_affine .type ecp_nistz256_point_add_affine,%function .align 5 ecp_nistz256_point_add_affine: -.inst 0xd503233f // paciasp + AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-80]! add x29,sp,#0 stp x19,x20,[sp,#16] @@ -3609,7 +3609,7 @@ ecp_nistz256_point_add_affine: ldp x23,x24,[x29,#48] ldp x25,x26,[x29,#64] ldp x29,x30,[sp],#80 -.inst 0xd50323bf // autiasp + AARCH64_VALIDATE_LINK_REGISTER ret .size ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine //////////////////////////////////////////////////////////////////////// @@ -3619,6 +3619,8 @@ ecp_nistz256_point_add_affine: .type ecp_nistz256_ord_mul_mont,%function .align 4 ecp_nistz256_ord_mul_mont: + AARCH64_VALID_CALL_TARGET + // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. stp x29,x30,[sp,#-64]! add x29,sp,#0 stp x19,x20,[sp,#16] @@ -3827,6 +3829,8 @@ ecp_nistz256_ord_mul_mont: .type ecp_nistz256_ord_sqr_mont,%function .align 4 ecp_nistz256_ord_sqr_mont: + AARCH64_VALID_CALL_TARGET + // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. stp x29,x30,[sp,#-64]! add x29,sp,#0 stp x19,x20,[sp,#16] @@ -4015,6 +4019,8 @@ ecp_nistz256_ord_sqr_mont: .type ecp_nistz256_scatter_w5,%function .align 4 ecp_nistz256_scatter_w5: + AARCH64_VALID_CALL_TARGET + // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. stp x29,x30,[sp,#-16]! add x29,sp,#0 @@ -4077,6 +4083,8 @@ ecp_nistz256_scatter_w5: .type ecp_nistz256_gather_w5,%function .align 4 ecp_nistz256_gather_w5: + AARCH64_VALID_CALL_TARGET + // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. stp x29,x30,[sp,#-16]! add x29,sp,#0 @@ -4154,6 +4162,8 @@ ecp_nistz256_gather_w5: .type ecp_nistz256_scatter_w7,%function .align 4 ecp_nistz256_scatter_w7: + AARCH64_VALID_CALL_TARGET + // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. stp x29,x30,[sp,#-16]! 
add x29,sp,#0 @@ -4198,6 +4208,8 @@ ecp_nistz256_scatter_w7: .type ecp_nistz256_gather_w7,%function .align 4 ecp_nistz256_gather_w7: + AARCH64_VALID_CALL_TARGET + // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. stp x29,x30,[sp,#-16]! add x29,sp,#0 diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/AARCH64-GCC/crypto/md5/md5-aarch64.S b/CryptoPkg/Library/OpensslLib/OpensslGen/AARCH64-GCC/crypto/md5/md5-aarch64.S new file mode 100644 index 000000000000..7045e31f189f --- /dev/null +++ b/CryptoPkg/Library/OpensslLib/OpensslGen/AARCH64-GCC/crypto/md5/md5-aarch64.S @@ -0,0 +1,677 @@ +#include "arm_arch.h" + +.text +.globl ossl_md5_block_asm_data_order +.type ossl_md5_block_asm_data_order,@function +ossl_md5_block_asm_data_order: + AARCH64_VALID_CALL_TARGET + // Save all callee-saved registers + stp x19,x20,[sp,#-80]! + stp x21,x22,[sp,#16] + stp x23,x24,[sp,#32] + stp x25,x26,[sp,#48] + stp x27,x28,[sp,#64] + + ldp w10, w11, [x0, #0] // .Load MD5 state->A and state->B + ldp w12, w13, [x0, #8] // .Load MD5 state->C and state->D +.align 5 +ossl_md5_blocks_loop: + eor x17, x12, x13 // Begin aux function round 1 F(x,y,z)=(((y^z)&x)^z) + and x16, x17, x11 // Continue aux function round 1 F(x,y,z)=(((y^z)&x)^z) + ldp w15, w20, [x1] // .Load 2 words of input data0 M[0],M[1] + ldp w3, w21, [x1, #8] // .Load 2 words of input data0 M[2],M[3] +#ifdef __AARCH64EB__ + rev w15, w15 + rev w20, w20 + rev w3, w3 + rev w21, w21 +#endif + eor x14, x16, x13 // End aux function round 1 F(x,y,z)=(((y^z)&x)^z) + movz x9, #0xa478 // .Load lower half of constant 0xd76aa478 + movk x9, #0xd76a, lsl #16 // .Load upper half of constant 0xd76aa478 + add w8, w10, w15 // Add dest value + add w7, w8, w9 // Add constant 0xd76aa478 + add w6, w7, w14 // Add aux function result + ror w6, w6, #25 // Rotate left s=7 bits + eor x5, x11, x12 // Begin aux function round 1 F(x,y,z)=(((y^z)&x)^z) + add w4, w11, w6 // Add X parameter round 1 A=FF(A, B, C, D, 0xd76aa478, s=7, M[0]) + and x8, x5, x4 // Continue aux function round 1 F(x,y,z)=(((y^z)&x)^z) + eor x17, x8, x12 // End aux function round 1 F(x,y,z)=(((y^z)&x)^z) + movz x16, #0xb756 // .Load lower half of constant 0xe8c7b756 + movk x16, #0xe8c7, lsl #16 // .Load upper half of constant 0xe8c7b756 + add w9, w13, w20 // Add dest value + add w7, w9, w16 // Add constant 0xe8c7b756 + add w14, w7, w17 // Add aux function result + ror w14, w14, #20 // Rotate left s=12 bits + eor x6, x4, x11 // Begin aux function round 1 F(x,y,z)=(((y^z)&x)^z) + add w5, w4, w14 // Add X parameter round 1 D=FF(D, A, B, C, 0xe8c7b756, s=12, M[1]) + and x8, x6, x5 // Continue aux function round 1 F(x,y,z)=(((y^z)&x)^z) + eor x9, x8, x11 // End aux function round 1 F(x,y,z)=(((y^z)&x)^z) + movz x16, #0x70db // .Load lower half of constant 0x242070db + movk x16, #0x2420, lsl #16 // .Load upper half of constant 0x242070db + add w7, w12, w3 // Add dest value + add w17, w7, w16 // Add constant 0x242070db + add w14, w17, w9 // Add aux function result + ror w14, w14, #15 // Rotate left s=17 bits + eor x6, x5, x4 // Begin aux function round 1 F(x,y,z)=(((y^z)&x)^z) + add w8, w5, w14 // Add X parameter round 1 C=FF(C, D, A, B, 0x242070db, s=17, M[2]) + and x7, x6, x8 // Continue aux function round 1 F(x,y,z)=(((y^z)&x)^z) + eor x16, x7, x4 // End aux function round 1 F(x,y,z)=(((y^z)&x)^z) + movz x9, #0xceee // .Load lower half of constant 0xc1bdceee + movk x9, #0xc1bd, lsl #16 // .Load upper half of constant 0xc1bdceee + add w14, w11, w21 // Add dest value + add w6, w14, w9 // Add 
constant 0xc1bdceee + add w7, w6, w16 // Add aux function result + ror w7, w7, #10 // Rotate left s=22 bits + eor x17, x8, x5 // Begin aux function round 1 F(x,y,z)=(((y^z)&x)^z) + add w9, w8, w7 // Add X parameter round 1 B=FF(B, C, D, A, 0xc1bdceee, s=22, M[3]) + ldp w14, w22, [x1, #16] // .Load 2 words of input data0 M[4],M[5] + ldp w7, w23, [x1, #24] // .Load 2 words of input data0 M[6],M[7] +#ifdef __AARCH64EB__ + rev w14, w14 + rev w22, w22 + rev w7, w7 + rev w23, w23 +#endif + and x16, x17, x9 // Continue aux function round 1 F(x,y,z)=(((y^z)&x)^z) + eor x6, x16, x5 // End aux function round 1 F(x,y,z)=(((y^z)&x)^z) + movz x16, #0xfaf // .Load lower half of constant 0xf57c0faf + movk x16, #0xf57c, lsl #16 // .Load upper half of constant 0xf57c0faf + add w17, w4, w14 // Add dest value + add w16, w17, w16 // Add constant 0xf57c0faf + add w4, w16, w6 // Add aux function result + ror w4, w4, #25 // Rotate left s=7 bits + eor x16, x9, x8 // Begin aux function round 1 F(x,y,z)=(((y^z)&x)^z) + add w17, w9, w4 // Add X parameter round 1 A=FF(A, B, C, D, 0xf57c0faf, s=7, M[4]) + and x16, x16, x17 // Continue aux function round 1 F(x,y,z)=(((y^z)&x)^z) + eor x6, x16, x8 // End aux function round 1 F(x,y,z)=(((y^z)&x)^z) + movz x4, #0xc62a // .Load lower half of constant 0x4787c62a + movk x4, #0x4787, lsl #16 // .Load upper half of constant 0x4787c62a + add w16, w5, w22 // Add dest value + add w16, w16, w4 // Add constant 0x4787c62a + add w5, w16, w6 // Add aux function result + ror w5, w5, #20 // Rotate left s=12 bits + eor x4, x17, x9 // Begin aux function round 1 F(x,y,z)=(((y^z)&x)^z) + add w19, w17, w5 // Add X parameter round 1 D=FF(D, A, B, C, 0x4787c62a, s=12, M[5]) + and x6, x4, x19 // Continue aux function round 1 F(x,y,z)=(((y^z)&x)^z) + eor x5, x6, x9 // End aux function round 1 F(x,y,z)=(((y^z)&x)^z) + movz x4, #0x4613 // .Load lower half of constant 0xa8304613 + movk x4, #0xa830, lsl #16 // .Load upper half of constant 0xa8304613 + add w6, w8, w7 // Add dest value + add w8, w6, w4 // Add constant 0xa8304613 + add w4, w8, w5 // Add aux function result + ror w4, w4, #15 // Rotate left s=17 bits + eor x6, x19, x17 // Begin aux function round 1 F(x,y,z)=(((y^z)&x)^z) + add w8, w19, w4 // Add X parameter round 1 C=FF(C, D, A, B, 0xa8304613, s=17, M[6]) + and x5, x6, x8 // Continue aux function round 1 F(x,y,z)=(((y^z)&x)^z) + eor x4, x5, x17 // End aux function round 1 F(x,y,z)=(((y^z)&x)^z) + movz x6, #0x9501 // .Load lower half of constant 0xfd469501 + movk x6, #0xfd46, lsl #16 // .Load upper half of constant 0xfd469501 + add w9, w9, w23 // Add dest value + add w5, w9, w6 // Add constant 0xfd469501 + add w9, w5, w4 // Add aux function result + ror w9, w9, #10 // Rotate left s=22 bits + eor x6, x8, x19 // Begin aux function round 1 F(x,y,z)=(((y^z)&x)^z) + add w4, w8, w9 // Add X parameter round 1 B=FF(B, C, D, A, 0xfd469501, s=22, M[7]) + ldp w5, w24, [x1, #32] // .Load 2 words of input data0 M[8],M[9] + ldp w16, w25, [x1, #40] // .Load 2 words of input data0 M[10],M[11] +#ifdef __AARCH64EB__ + rev w5, w5 + rev w24, w24 + rev w16, w16 + rev w25, w25 +#endif + and x9, x6, x4 // Continue aux function round 1 F(x,y,z)=(((y^z)&x)^z) + eor x6, x9, x19 // End aux function round 1 F(x,y,z)=(((y^z)&x)^z) + movz x9, #0x98d8 // .Load lower half of constant 0x698098d8 + movk x9, #0x6980, lsl #16 // .Load upper half of constant 0x698098d8 + add w17, w17, w5 // Add dest value + add w9, w17, w9 // Add constant 0x698098d8 + add w17, w9, w6 // Add aux function result + ror w17, w17, #25 // Rotate 
left s=7 bits + eor x9, x4, x8 // Begin aux function round 1 F(x,y,z)=(((y^z)&x)^z) + add w6, w4, w17 // Add X parameter round 1 A=FF(A, B, C, D, 0x698098d8, s=7, M[8]) + and x17, x9, x6 // Continue aux function round 1 F(x,y,z)=(((y^z)&x)^z) + eor x9, x17, x8 // End aux function round 1 F(x,y,z)=(((y^z)&x)^z) + movz x17, #0xf7af // .Load lower half of constant 0x8b44f7af + movk x17, #0x8b44, lsl #16 // .Load upper half of constant 0x8b44f7af + add w19, w19, w24 // Add dest value + add w17, w19, w17 // Add constant 0x8b44f7af + add w19, w17, w9 // Add aux function result + ror w19, w19, #20 // Rotate left s=12 bits + eor x9, x6, x4 // Begin aux function round 1 F(x,y,z)=(((y^z)&x)^z) + add w17, w6, w19 // Add X parameter round 1 D=FF(D, A, B, C, 0x8b44f7af, s=12, M[9]) + and x9, x9, x17 // Continue aux function round 1 F(x,y,z)=(((y^z)&x)^z) + eor x9, x9, x4 // End aux function round 1 F(x,y,z)=(((y^z)&x)^z) + movz x11, #0x5bb1 // .Load lower half of constant 0xffff5bb1 + movk x11, #0xffff, lsl #16 // .Load upper half of constant 0xffff5bb1 + add w8, w8, w16 // Add dest value + add w8, w8, w11 // Add constant 0xffff5bb1 + add w8, w8, w9 // Add aux function result + ror w8, w8, #15 // Rotate left s=17 bits + eor x9, x17, x6 // Begin aux function round 1 F(x,y,z)=(((y^z)&x)^z) + add w8, w17, w8 // Add X parameter round 1 C=FF(C, D, A, B, 0xffff5bb1, s=17, M[10]) + and x9, x9, x8 // Continue aux function round 1 F(x,y,z)=(((y^z)&x)^z) + eor x9, x9, x6 // End aux function round 1 F(x,y,z)=(((y^z)&x)^z) + movz x11, #0xd7be // .Load lower half of constant 0x895cd7be + movk x11, #0x895c, lsl #16 // .Load upper half of constant 0x895cd7be + add w4, w4, w25 // Add dest value + add w4, w4, w11 // Add constant 0x895cd7be + add w9, w4, w9 // Add aux function result + ror w9, w9, #10 // Rotate left s=22 bits + eor x4, x8, x17 // Begin aux function round 1 F(x,y,z)=(((y^z)&x)^z) + add w9, w8, w9 // Add X parameter round 1 B=FF(B, C, D, A, 0x895cd7be, s=22, M[11]) + ldp w11, w26, [x1, #48] // .Load 2 words of input data0 M[12],M[13] + ldp w12, w27, [x1, #56] // .Load 2 words of input data0 M[14],M[15] +#ifdef __AARCH64EB__ + rev w11, w11 + rev w26, w26 + rev w12, w12 + rev w27, w27 +#endif + and x4, x4, x9 // Continue aux function round 1 F(x,y,z)=(((y^z)&x)^z) + eor x4, x4, x17 // End aux function round 1 F(x,y,z)=(((y^z)&x)^z) + movz x19, #0x1122 // .Load lower half of constant 0x6b901122 + movk x19, #0x6b90, lsl #16 // .Load upper half of constant 0x6b901122 + add w6, w6, w11 // Add dest value + add w6, w6, w19 // Add constant 0x6b901122 + add w4, w6, w4 // Add aux function result + ror w4, w4, #25 // Rotate left s=7 bits + eor x6, x9, x8 // Begin aux function round 1 F(x,y,z)=(((y^z)&x)^z) + add w4, w9, w4 // Add X parameter round 1 A=FF(A, B, C, D, 0x6b901122, s=7, M[12]) + and x6, x6, x4 // Continue aux function round 1 F(x,y,z)=(((y^z)&x)^z) + eor x6, x6, x8 // End aux function round 1 F(x,y,z)=(((y^z)&x)^z) + movz x19, #0x7193 // .Load lower half of constant 0xfd987193 + movk x19, #0xfd98, lsl #16 // .Load upper half of constant 0xfd987193 + add w17, w17, w26 // Add dest value + add w17, w17, w19 // Add constant 0xfd987193 + add w17, w17, w6 // Add aux function result + ror w17, w17, #20 // Rotate left s=12 bits + eor x6, x4, x9 // Begin aux function round 1 F(x,y,z)=(((y^z)&x)^z) + add w17, w4, w17 // Add X parameter round 1 D=FF(D, A, B, C, 0xfd987193, s=12, M[13]) + and x6, x6, x17 // Continue aux function round 1 F(x,y,z)=(((y^z)&x)^z) + eor x6, x6, x9 // End aux function round 1 
F(x,y,z)=(((y^z)&x)^z) + movz x13, #0x438e // .Load lower half of constant 0xa679438e + movk x13, #0xa679, lsl #16 // .Load upper half of constant 0xa679438e + add w8, w8, w12 // Add dest value + add w8, w8, w13 // Add constant 0xa679438e + add w8, w8, w6 // Add aux function result + ror w8, w8, #15 // Rotate left s=17 bits + eor x6, x17, x4 // Begin aux function round 1 F(x,y,z)=(((y^z)&x)^z) + add w8, w17, w8 // Add X parameter round 1 C=FF(C, D, A, B, 0xa679438e, s=17, M[14]) + and x6, x6, x8 // Continue aux function round 1 F(x,y,z)=(((y^z)&x)^z) + eor x6, x6, x4 // End aux function round 1 F(x,y,z)=(((y^z)&x)^z) + movz x13, #0x821 // .Load lower half of constant 0x49b40821 + movk x13, #0x49b4, lsl #16 // .Load upper half of constant 0x49b40821 + add w9, w9, w27 // Add dest value + add w9, w9, w13 // Add constant 0x49b40821 + add w9, w9, w6 // Add aux function result + ror w9, w9, #10 // Rotate left s=22 bits + bic x6, x8, x17 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y)) + add w9, w8, w9 // Add X parameter round 1 B=FF(B, C, D, A, 0x49b40821, s=22, M[15]) + and x13, x9, x17 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y)) + orr x6, x6, x13 // End aux function round 2 G(x,y,z)=((x&z)|(~z&y)) + movz x13, #0x2562 // .Load lower half of constant 0xf61e2562 + movk x13, #0xf61e, lsl #16 // .Load upper half of constant 0xf61e2562 + add w4, w4, w20 // Add dest value + add w4, w4, w13 // Add constant 0xf61e2562 + add w4, w4, w6 // Add aux function result + ror w4, w4, #27 // Rotate left s=5 bits + bic x6, x9, x8 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y)) + add w4, w9, w4 // Add X parameter round 2 A=GG(A, B, C, D, 0xf61e2562, s=5, M[1]) + and x13, x4, x8 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y)) + orr x6, x6, x13 // End aux function round 2 G(x,y,z)=((x&z)|(~z&y)) + movz x13, #0xb340 // .Load lower half of constant 0xc040b340 + movk x13, #0xc040, lsl #16 // .Load upper half of constant 0xc040b340 + add w17, w17, w7 // Add dest value + add w17, w17, w13 // Add constant 0xc040b340 + add w17, w17, w6 // Add aux function result + ror w17, w17, #23 // Rotate left s=9 bits + bic x6, x4, x9 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y)) + add w17, w4, w17 // Add X parameter round 2 D=GG(D, A, B, C, 0xc040b340, s=9, M[6]) + and x13, x17, x9 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y)) + orr x6, x6, x13 // End aux function round 2 G(x,y,z)=((x&z)|(~z&y)) + movz x13, #0x5a51 // .Load lower half of constant 0x265e5a51 + movk x13, #0x265e, lsl #16 // .Load upper half of constant 0x265e5a51 + add w8, w8, w25 // Add dest value + add w8, w8, w13 // Add constant 0x265e5a51 + add w8, w8, w6 // Add aux function result + ror w8, w8, #18 // Rotate left s=14 bits + bic x6, x17, x4 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y)) + add w8, w17, w8 // Add X parameter round 2 C=GG(C, D, A, B, 0x265e5a51, s=14, M[11]) + and x13, x8, x4 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y)) + orr x6, x6, x13 // End aux function round 2 G(x,y,z)=((x&z)|(~z&y)) + movz x13, #0xc7aa // .Load lower half of constant 0xe9b6c7aa + movk x13, #0xe9b6, lsl #16 // .Load upper half of constant 0xe9b6c7aa + add w9, w9, w15 // Add dest value + add w9, w9, w13 // Add constant 0xe9b6c7aa + add w9, w9, w6 // Add aux function result + ror w9, w9, #12 // Rotate left s=20 bits + bic x6, x8, x17 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y)) + add w9, w8, w9 // Add X parameter round 2 B=GG(B, C, D, A, 0xe9b6c7aa, s=20, M[0]) + and x13, x9, x17 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y)) + orr x6, x6, x13 // End aux function 
round 2 G(x,y,z)=((x&z)|(~z&y)) + movz x13, #0x105d // .Load lower half of constant 0xd62f105d + movk x13, #0xd62f, lsl #16 // .Load upper half of constant 0xd62f105d + add w4, w4, w22 // Add dest value + add w4, w4, w13 // Add constant 0xd62f105d + add w4, w4, w6 // Add aux function result + ror w4, w4, #27 // Rotate left s=5 bits + bic x6, x9, x8 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y)) + add w4, w9, w4 // Add X parameter round 2 A=GG(A, B, C, D, 0xd62f105d, s=5, M[5]) + and x13, x4, x8 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y)) + orr x6, x6, x13 // End aux function round 2 G(x,y,z)=((x&z)|(~z&y)) + movz x13, #0x1453 // .Load lower half of constant 0x2441453 + movk x13, #0x244, lsl #16 // .Load upper half of constant 0x2441453 + add w17, w17, w16 // Add dest value + add w17, w17, w13 // Add constant 0x2441453 + add w17, w17, w6 // Add aux function result + ror w17, w17, #23 // Rotate left s=9 bits + bic x6, x4, x9 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y)) + add w17, w4, w17 // Add X parameter round 2 D=GG(D, A, B, C, 0x2441453, s=9, M[10]) + and x13, x17, x9 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y)) + orr x6, x6, x13 // End aux function round 2 G(x,y,z)=((x&z)|(~z&y)) + movz x13, #0xe681 // .Load lower half of constant 0xd8a1e681 + movk x13, #0xd8a1, lsl #16 // .Load upper half of constant 0xd8a1e681 + add w8, w8, w27 // Add dest value + add w8, w8, w13 // Add constant 0xd8a1e681 + add w8, w8, w6 // Add aux function result + ror w8, w8, #18 // Rotate left s=14 bits + bic x6, x17, x4 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y)) + add w8, w17, w8 // Add X parameter round 2 C=GG(C, D, A, B, 0xd8a1e681, s=14, M[15]) + and x13, x8, x4 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y)) + orr x6, x6, x13 // End aux function round 2 G(x,y,z)=((x&z)|(~z&y)) + movz x13, #0xfbc8 // .Load lower half of constant 0xe7d3fbc8 + movk x13, #0xe7d3, lsl #16 // .Load upper half of constant 0xe7d3fbc8 + add w9, w9, w14 // Add dest value + add w9, w9, w13 // Add constant 0xe7d3fbc8 + add w9, w9, w6 // Add aux function result + ror w9, w9, #12 // Rotate left s=20 bits + bic x6, x8, x17 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y)) + add w9, w8, w9 // Add X parameter round 2 B=GG(B, C, D, A, 0xe7d3fbc8, s=20, M[4]) + and x13, x9, x17 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y)) + orr x6, x6, x13 // End aux function round 2 G(x,y,z)=((x&z)|(~z&y)) + movz x13, #0xcde6 // .Load lower half of constant 0x21e1cde6 + movk x13, #0x21e1, lsl #16 // .Load upper half of constant 0x21e1cde6 + add w4, w4, w24 // Add dest value + add w4, w4, w13 // Add constant 0x21e1cde6 + add w4, w4, w6 // Add aux function result + ror w4, w4, #27 // Rotate left s=5 bits + bic x6, x9, x8 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y)) + add w4, w9, w4 // Add X parameter round 2 A=GG(A, B, C, D, 0x21e1cde6, s=5, M[9]) + and x13, x4, x8 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y)) + orr x6, x6, x13 // End aux function round 2 G(x,y,z)=((x&z)|(~z&y)) + movz x13, #0x7d6 // .Load lower half of constant 0xc33707d6 + movk x13, #0xc337, lsl #16 // .Load upper half of constant 0xc33707d6 + add w17, w17, w12 // Add dest value + add w17, w17, w13 // Add constant 0xc33707d6 + add w17, w17, w6 // Add aux function result + ror w17, w17, #23 // Rotate left s=9 bits + bic x6, x4, x9 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y)) + add w17, w4, w17 // Add X parameter round 2 D=GG(D, A, B, C, 0xc33707d6, s=9, M[14]) + and x13, x17, x9 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y)) + orr x6, x6, x13 // End aux function round 
2 G(x,y,z)=((x&z)|(~z&y)) + movz x13, #0xd87 // .Load lower half of constant 0xf4d50d87 + movk x13, #0xf4d5, lsl #16 // .Load upper half of constant 0xf4d50d87 + add w8, w8, w21 // Add dest value + add w8, w8, w13 // Add constant 0xf4d50d87 + add w8, w8, w6 // Add aux function result + ror w8, w8, #18 // Rotate left s=14 bits + bic x6, x17, x4 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y)) + add w8, w17, w8 // Add X parameter round 2 C=GG(C, D, A, B, 0xf4d50d87, s=14, M[3]) + and x13, x8, x4 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y)) + orr x6, x6, x13 // End aux function round 2 G(x,y,z)=((x&z)|(~z&y)) + movz x13, #0x14ed // .Load lower half of constant 0x455a14ed + movk x13, #0x455a, lsl #16 // .Load upper half of constant 0x455a14ed + add w9, w9, w5 // Add dest value + add w9, w9, w13 // Add constant 0x455a14ed + add w9, w9, w6 // Add aux function result + ror w9, w9, #12 // Rotate left s=20 bits + bic x6, x8, x17 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y)) + add w9, w8, w9 // Add X parameter round 2 B=GG(B, C, D, A, 0x455a14ed, s=20, M[8]) + and x13, x9, x17 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y)) + orr x6, x6, x13 // End aux function round 2 G(x,y,z)=((x&z)|(~z&y)) + movz x13, #0xe905 // .Load lower half of constant 0xa9e3e905 + movk x13, #0xa9e3, lsl #16 // .Load upper half of constant 0xa9e3e905 + add w4, w4, w26 // Add dest value + add w4, w4, w13 // Add constant 0xa9e3e905 + add w4, w4, w6 // Add aux function result + ror w4, w4, #27 // Rotate left s=5 bits + bic x6, x9, x8 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y)) + add w4, w9, w4 // Add X parameter round 2 A=GG(A, B, C, D, 0xa9e3e905, s=5, M[13]) + and x13, x4, x8 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y)) + orr x6, x6, x13 // End aux function round 2 G(x,y,z)=((x&z)|(~z&y)) + movz x13, #0xa3f8 // .Load lower half of constant 0xfcefa3f8 + movk x13, #0xfcef, lsl #16 // .Load upper half of constant 0xfcefa3f8 + add w17, w17, w3 // Add dest value + add w17, w17, w13 // Add constant 0xfcefa3f8 + add w17, w17, w6 // Add aux function result + ror w17, w17, #23 // Rotate left s=9 bits + bic x6, x4, x9 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y)) + add w17, w4, w17 // Add X parameter round 2 D=GG(D, A, B, C, 0xfcefa3f8, s=9, M[2]) + and x13, x17, x9 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y)) + orr x6, x6, x13 // End aux function round 2 G(x,y,z)=((x&z)|(~z&y)) + movz x13, #0x2d9 // .Load lower half of constant 0x676f02d9 + movk x13, #0x676f, lsl #16 // .Load upper half of constant 0x676f02d9 + add w8, w8, w23 // Add dest value + add w8, w8, w13 // Add constant 0x676f02d9 + add w8, w8, w6 // Add aux function result + ror w8, w8, #18 // Rotate left s=14 bits + bic x6, x17, x4 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y)) + add w8, w17, w8 // Add X parameter round 2 C=GG(C, D, A, B, 0x676f02d9, s=14, M[7]) + and x13, x8, x4 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y)) + orr x6, x6, x13 // End aux function round 2 G(x,y,z)=((x&z)|(~z&y)) + movz x13, #0x4c8a // .Load lower half of constant 0x8d2a4c8a + movk x13, #0x8d2a, lsl #16 // .Load upper half of constant 0x8d2a4c8a + add w9, w9, w11 // Add dest value + add w9, w9, w13 // Add constant 0x8d2a4c8a + add w9, w9, w6 // Add aux function result + eor x6, x8, x17 // Begin aux function round 3 H(x,y,z)=(x^y^z) + ror w9, w9, #12 // Rotate left s=20 bits + movz x10, #0x3942 // .Load lower half of constant 0xfffa3942 + add w9, w8, w9 // Add X parameter round 2 B=GG(B, C, D, A, 0x8d2a4c8a, s=20, M[12]) + movk x10, #0xfffa, lsl #16 // .Load upper half of 
constant 0xfffa3942 + add w4, w4, w22 // Add dest value + eor x6, x6, x9 // End aux function round 3 H(x,y,z)=(x^y^z) + add w4, w4, w10 // Add constant 0xfffa3942 + add w4, w4, w6 // Add aux function result + ror w4, w4, #28 // Rotate left s=4 bits + eor x6, x9, x8 // Begin aux function round 3 H(x,y,z)=(x^y^z) + movz x10, #0xf681 // .Load lower half of constant 0x8771f681 + add w4, w9, w4 // Add X parameter round 3 A=HH(A, B, C, D, 0xfffa3942, s=4, M[5]) + movk x10, #0x8771, lsl #16 // .Load upper half of constant 0x8771f681 + add w17, w17, w5 // Add dest value + eor x6, x6, x4 // End aux function round 3 H(x,y,z)=(x^y^z) + add w17, w17, w10 // Add constant 0x8771f681 + add w17, w17, w6 // Add aux function result + eor x6, x4, x9 // Begin aux function round 3 H(x,y,z)=(x^y^z) + ror w17, w17, #21 // Rotate left s=11 bits + movz x13, #0x6122 // .Load lower half of constant 0x6d9d6122 + add w17, w4, w17 // Add X parameter round 3 D=HH(D, A, B, C, 0x8771f681, s=11, M[8]) + movk x13, #0x6d9d, lsl #16 // .Load upper half of constant 0x6d9d6122 + add w8, w8, w25 // Add dest value + eor x6, x6, x17 // End aux function round 3 H(x,y,z)=(x^y^z) + add w8, w8, w13 // Add constant 0x6d9d6122 + add w8, w8, w6 // Add aux function result + ror w8, w8, #16 // Rotate left s=16 bits + eor x6, x17, x4 // Begin aux function round 3 H(x,y,z)=(x^y^z) + movz x13, #0x380c // .Load lower half of constant 0xfde5380c + add w8, w17, w8 // Add X parameter round 3 C=HH(C, D, A, B, 0x6d9d6122, s=16, M[11]) + movk x13, #0xfde5, lsl #16 // .Load upper half of constant 0xfde5380c + add w9, w9, w12 // Add dest value + eor x6, x6, x8 // End aux function round 3 H(x,y,z)=(x^y^z) + add w9, w9, w13 // Add constant 0xfde5380c + add w9, w9, w6 // Add aux function result + eor x6, x8, x17 // Begin aux function round 3 H(x,y,z)=(x^y^z) + ror w9, w9, #9 // Rotate left s=23 bits + movz x10, #0xea44 // .Load lower half of constant 0xa4beea44 + add w9, w8, w9 // Add X parameter round 3 B=HH(B, C, D, A, 0xfde5380c, s=23, M[14]) + movk x10, #0xa4be, lsl #16 // .Load upper half of constant 0xa4beea44 + add w4, w4, w20 // Add dest value + eor x6, x6, x9 // End aux function round 3 H(x,y,z)=(x^y^z) + add w4, w4, w10 // Add constant 0xa4beea44 + add w4, w4, w6 // Add aux function result + ror w4, w4, #28 // Rotate left s=4 bits + eor x6, x9, x8 // Begin aux function round 3 H(x,y,z)=(x^y^z) + movz x10, #0xcfa9 // .Load lower half of constant 0x4bdecfa9 + add w4, w9, w4 // Add X parameter round 3 A=HH(A, B, C, D, 0xa4beea44, s=4, M[1]) + movk x10, #0x4bde, lsl #16 // .Load upper half of constant 0x4bdecfa9 + add w17, w17, w14 // Add dest value + eor x6, x6, x4 // End aux function round 3 H(x,y,z)=(x^y^z) + add w17, w17, w10 // Add constant 0x4bdecfa9 + add w17, w17, w6 // Add aux function result + eor x6, x4, x9 // Begin aux function round 3 H(x,y,z)=(x^y^z) + ror w17, w17, #21 // Rotate left s=11 bits + movz x13, #0x4b60 // .Load lower half of constant 0xf6bb4b60 + add w17, w4, w17 // Add X parameter round 3 D=HH(D, A, B, C, 0x4bdecfa9, s=11, M[4]) + movk x13, #0xf6bb, lsl #16 // .Load upper half of constant 0xf6bb4b60 + add w8, w8, w23 // Add dest value + eor x6, x6, x17 // End aux function round 3 H(x,y,z)=(x^y^z) + add w8, w8, w13 // Add constant 0xf6bb4b60 + add w8, w8, w6 // Add aux function result + ror w8, w8, #16 // Rotate left s=16 bits + eor x6, x17, x4 // Begin aux function round 3 H(x,y,z)=(x^y^z) + movz x13, #0xbc70 // .Load lower half of constant 0xbebfbc70 + add w8, w17, w8 // Add X parameter round 3 C=HH(C, D, A, B, 
0xf6bb4b60, s=16, M[7]) + movk x13, #0xbebf, lsl #16 // .Load upper half of constant 0xbebfbc70 + add w9, w9, w16 // Add dest value + eor x6, x6, x8 // End aux function round 3 H(x,y,z)=(x^y^z) + add w9, w9, w13 // Add constant 0xbebfbc70 + add w9, w9, w6 // Add aux function result + eor x6, x8, x17 // Begin aux function round 3 H(x,y,z)=(x^y^z) + ror w9, w9, #9 // Rotate left s=23 bits + movz x10, #0x7ec6 // .Load lower half of constant 0x289b7ec6 + add w9, w8, w9 // Add X parameter round 3 B=HH(B, C, D, A, 0xbebfbc70, s=23, M[10]) + movk x10, #0x289b, lsl #16 // .Load upper half of constant 0x289b7ec6 + add w4, w4, w26 // Add dest value + eor x6, x6, x9 // End aux function round 3 H(x,y,z)=(x^y^z) + add w4, w4, w10 // Add constant 0x289b7ec6 + add w4, w4, w6 // Add aux function result + ror w4, w4, #28 // Rotate left s=4 bits + eor x6, x9, x8 // Begin aux function round 3 H(x,y,z)=(x^y^z) + movz x10, #0x27fa // .Load lower half of constant 0xeaa127fa + add w4, w9, w4 // Add X parameter round 3 A=HH(A, B, C, D, 0x289b7ec6, s=4, M[13]) + movk x10, #0xeaa1, lsl #16 // .Load upper half of constant 0xeaa127fa + add w17, w17, w15 // Add dest value + eor x6, x6, x4 // End aux function round 3 H(x,y,z)=(x^y^z) + add w17, w17, w10 // Add constant 0xeaa127fa + add w17, w17, w6 // Add aux function result + eor x6, x4, x9 // Begin aux function round 3 H(x,y,z)=(x^y^z) + ror w17, w17, #21 // Rotate left s=11 bits + movz x13, #0x3085 // .Load lower half of constant 0xd4ef3085 + add w17, w4, w17 // Add X parameter round 3 D=HH(D, A, B, C, 0xeaa127fa, s=11, M[0]) + movk x13, #0xd4ef, lsl #16 // .Load upper half of constant 0xd4ef3085 + add w8, w8, w21 // Add dest value + eor x6, x6, x17 // End aux function round 3 H(x,y,z)=(x^y^z) + add w8, w8, w13 // Add constant 0xd4ef3085 + add w8, w8, w6 // Add aux function result + ror w8, w8, #16 // Rotate left s=16 bits + eor x6, x17, x4 // Begin aux function round 3 H(x,y,z)=(x^y^z) + movz x13, #0x1d05 // .Load lower half of constant 0x4881d05 + add w8, w17, w8 // Add X parameter round 3 C=HH(C, D, A, B, 0xd4ef3085, s=16, M[3]) + movk x13, #0x488, lsl #16 // .Load upper half of constant 0x4881d05 + add w9, w9, w7 // Add dest value + eor x6, x6, x8 // End aux function round 3 H(x,y,z)=(x^y^z) + add w9, w9, w13 // Add constant 0x4881d05 + add w9, w9, w6 // Add aux function result + eor x6, x8, x17 // Begin aux function round 3 H(x,y,z)=(x^y^z) + ror w9, w9, #9 // Rotate left s=23 bits + movz x10, #0xd039 // .Load lower half of constant 0xd9d4d039 + add w9, w8, w9 // Add X parameter round 3 B=HH(B, C, D, A, 0x4881d05, s=23, M[6]) + movk x10, #0xd9d4, lsl #16 // .Load upper half of constant 0xd9d4d039 + add w4, w4, w24 // Add dest value + eor x6, x6, x9 // End aux function round 3 H(x,y,z)=(x^y^z) + add w4, w4, w10 // Add constant 0xd9d4d039 + add w4, w4, w6 // Add aux function result + ror w4, w4, #28 // Rotate left s=4 bits + eor x6, x9, x8 // Begin aux function round 3 H(x,y,z)=(x^y^z) + movz x10, #0x99e5 // .Load lower half of constant 0xe6db99e5 + add w4, w9, w4 // Add X parameter round 3 A=HH(A, B, C, D, 0xd9d4d039, s=4, M[9]) + movk x10, #0xe6db, lsl #16 // .Load upper half of constant 0xe6db99e5 + add w17, w17, w11 // Add dest value + eor x6, x6, x4 // End aux function round 3 H(x,y,z)=(x^y^z) + add w17, w17, w10 // Add constant 0xe6db99e5 + add w17, w17, w6 // Add aux function result + eor x6, x4, x9 // Begin aux function round 3 H(x,y,z)=(x^y^z) + ror w17, w17, #21 // Rotate left s=11 bits + movz x13, #0x7cf8 // .Load lower half of constant 0x1fa27cf8 + 
add w17, w4, w17 // Add X parameter round 3 D=HH(D, A, B, C, 0xe6db99e5, s=11, M[12]) + movk x13, #0x1fa2, lsl #16 // .Load upper half of constant 0x1fa27cf8 + add w8, w8, w27 // Add dest value + eor x6, x6, x17 // End aux function round 3 H(x,y,z)=(x^y^z) + add w8, w8, w13 // Add constant 0x1fa27cf8 + add w8, w8, w6 // Add aux function result + ror w8, w8, #16 // Rotate left s=16 bits + eor x6, x17, x4 // Begin aux function round 3 H(x,y,z)=(x^y^z) + movz x13, #0x5665 // .Load lower half of constant 0xc4ac5665 + add w8, w17, w8 // Add X parameter round 3 C=HH(C, D, A, B, 0x1fa27cf8, s=16, M[15]) + movk x13, #0xc4ac, lsl #16 // .Load upper half of constant 0xc4ac5665 + add w9, w9, w3 // Add dest value + eor x6, x6, x8 // End aux function round 3 H(x,y,z)=(x^y^z) + add w9, w9, w13 // Add constant 0xc4ac5665 + add w9, w9, w6 // Add aux function result + ror w9, w9, #9 // Rotate left s=23 bits + movz x6, #0x2244 // .Load lower half of constant 0xf4292244 + movk x6, #0xf429, lsl #16 // .Load upper half of constant 0xf4292244 + add w9, w8, w9 // Add X parameter round 3 B=HH(B, C, D, A, 0xc4ac5665, s=23, M[2]) + add w4, w4, w15 // Add dest value + orn x13, x9, x17 // Begin aux function round 4 I(x,y,z)=((~z|x)^y) + add w4, w4, w6 // Add constant 0xf4292244 + eor x6, x8, x13 // End aux function round 4 I(x,y,z)=((~z|x)^y) + add w4, w4, w6 // Add aux function result + ror w4, w4, #26 // Rotate left s=6 bits + movz x6, #0xff97 // .Load lower half of constant 0x432aff97 + movk x6, #0x432a, lsl #16 // .Load upper half of constant 0x432aff97 + add w4, w9, w4 // Add X parameter round 4 A=II(A, B, C, D, 0xf4292244, s=6, M[0]) + orn x10, x4, x8 // Begin aux function round 4 I(x,y,z)=((~z|x)^y) + add w17, w17, w23 // Add dest value + eor x10, x9, x10 // End aux function round 4 I(x,y,z)=((~z|x)^y) + add w17, w17, w6 // Add constant 0x432aff97 + add w6, w17, w10 // Add aux function result + ror w6, w6, #22 // Rotate left s=10 bits + movz x17, #0x23a7 // .Load lower half of constant 0xab9423a7 + movk x17, #0xab94, lsl #16 // .Load upper half of constant 0xab9423a7 + add w6, w4, w6 // Add X parameter round 4 D=II(D, A, B, C, 0x432aff97, s=10, M[7]) + add w8, w8, w12 // Add dest value + orn x10, x6, x9 // Begin aux function round 4 I(x,y,z)=((~z|x)^y) + add w8, w8, w17 // Add constant 0xab9423a7 + eor x17, x4, x10 // End aux function round 4 I(x,y,z)=((~z|x)^y) + add w8, w8, w17 // Add aux function result + ror w8, w8, #17 // Rotate left s=15 bits + movz x17, #0xa039 // .Load lower half of constant 0xfc93a039 + movk x17, #0xfc93, lsl #16 // .Load upper half of constant 0xfc93a039 + add w8, w6, w8 // Add X parameter round 4 C=II(C, D, A, B, 0xab9423a7, s=15, M[14]) + orn x13, x8, x4 // Begin aux function round 4 I(x,y,z)=((~z|x)^y) + add w9, w9, w22 // Add dest value + eor x13, x6, x13 // End aux function round 4 I(x,y,z)=((~z|x)^y) + add w9, w9, w17 // Add constant 0xfc93a039 + add w17, w9, w13 // Add aux function result + ror w17, w17, #11 // Rotate left s=21 bits + movz x9, #0x59c3 // .Load lower half of constant 0x655b59c3 + movk x9, #0x655b, lsl #16 // .Load upper half of constant 0x655b59c3 + add w17, w8, w17 // Add X parameter round 4 B=II(B, C, D, A, 0xfc93a039, s=21, M[5]) + add w4, w4, w11 // Add dest value + orn x13, x17, x6 // Begin aux function round 4 I(x,y,z)=((~z|x)^y) + add w9, w4, w9 // Add constant 0x655b59c3 + eor x4, x8, x13 // End aux function round 4 I(x,y,z)=((~z|x)^y) + add w9, w9, w4 // Add aux function result + ror w9, w9, #26 // Rotate left s=6 bits + movz x4, #0xcc92 // .Load lower 
half of constant 0x8f0ccc92 + movk x4, #0x8f0c, lsl #16 // .Load upper half of constant 0x8f0ccc92 + add w9, w17, w9 // Add X parameter round 4 A=II(A, B, C, D, 0x655b59c3, s=6, M[12]) + orn x10, x9, x8 // Begin aux function round 4 I(x,y,z)=((~z|x)^y) + add w6, w6, w21 // Add dest value + eor x10, x17, x10 // End aux function round 4 I(x,y,z)=((~z|x)^y) + add w4, w6, w4 // Add constant 0x8f0ccc92 + add w6, w4, w10 // Add aux function result + ror w6, w6, #22 // Rotate left s=10 bits + movz x4, #0xf47d // .Load lower half of constant 0xffeff47d + movk x4, #0xffef, lsl #16 // .Load upper half of constant 0xffeff47d + add w6, w9, w6 // Add X parameter round 4 D=II(D, A, B, C, 0x8f0ccc92, s=10, M[3]) + add w8, w8, w16 // Add dest value + orn x10, x6, x17 // Begin aux function round 4 I(x,y,z)=((~z|x)^y) + add w8, w8, w4 // Add constant 0xffeff47d + eor x4, x9, x10 // End aux function round 4 I(x,y,z)=((~z|x)^y) + add w8, w8, w4 // Add aux function result + ror w8, w8, #17 // Rotate left s=15 bits + movz x4, #0x5dd1 // .Load lower half of constant 0x85845dd1 + movk x4, #0x8584, lsl #16 // .Load upper half of constant 0x85845dd1 + add w8, w6, w8 // Add X parameter round 4 C=II(C, D, A, B, 0xffeff47d, s=15, M[10]) + orn x10, x8, x9 // Begin aux function round 4 I(x,y,z)=((~z|x)^y) + add w15, w17, w20 // Add dest value + eor x17, x6, x10 // End aux function round 4 I(x,y,z)=((~z|x)^y) + add w15, w15, w4 // Add constant 0x85845dd1 + add w4, w15, w17 // Add aux function result + ror w4, w4, #11 // Rotate left s=21 bits + movz x15, #0x7e4f // .Load lower half of constant 0x6fa87e4f + movk x15, #0x6fa8, lsl #16 // .Load upper half of constant 0x6fa87e4f + add w17, w8, w4 // Add X parameter round 4 B=II(B, C, D, A, 0x85845dd1, s=21, M[1]) + add w4, w9, w5 // Add dest value + orn x9, x17, x6 // Begin aux function round 4 I(x,y,z)=((~z|x)^y) + add w15, w4, w15 // Add constant 0x6fa87e4f + eor x4, x8, x9 // End aux function round 4 I(x,y,z)=((~z|x)^y) + add w9, w15, w4 // Add aux function result + ror w9, w9, #26 // Rotate left s=6 bits + movz x15, #0xe6e0 // .Load lower half of constant 0xfe2ce6e0 + movk x15, #0xfe2c, lsl #16 // .Load upper half of constant 0xfe2ce6e0 + add w4, w17, w9 // Add X parameter round 4 A=II(A, B, C, D, 0x6fa87e4f, s=6, M[8]) + orn x9, x4, x8 // Begin aux function round 4 I(x,y,z)=((~z|x)^y) + add w6, w6, w27 // Add dest value + eor x9, x17, x9 // End aux function round 4 I(x,y,z)=((~z|x)^y) + add w15, w6, w15 // Add constant 0xfe2ce6e0 + add w6, w15, w9 // Add aux function result + ror w6, w6, #22 // Rotate left s=10 bits + movz x9, #0x4314 // .Load lower half of constant 0xa3014314 + movk x9, #0xa301, lsl #16 // .Load upper half of constant 0xa3014314 + add w15, w4, w6 // Add X parameter round 4 D=II(D, A, B, C, 0xfe2ce6e0, s=10, M[15]) + add w6, w8, w7 // Add dest value + orn x7, x15, x17 // Begin aux function round 4 I(x,y,z)=((~z|x)^y) + add w8, w6, w9 // Add constant 0xa3014314 + eor x9, x4, x7 // End aux function round 4 I(x,y,z)=((~z|x)^y) + add w6, w8, w9 // Add aux function result + ror w6, w6, #17 // Rotate left s=15 bits + movz x7, #0x11a1 // .Load lower half of constant 0x4e0811a1 + movk x7, #0x4e08, lsl #16 // .Load upper half of constant 0x4e0811a1 + add w8, w15, w6 // Add X parameter round 4 C=II(C, D, A, B, 0xa3014314, s=15, M[6]) + orn x9, x8, x4 // Begin aux function round 4 I(x,y,z)=((~z|x)^y) + add w6, w17, w26 // Add dest value + eor x17, x15, x9 // End aux function round 4 I(x,y,z)=((~z|x)^y) + add w9, w6, w7 // Add constant 0x4e0811a1 + add w7, w9, w17 
// Add aux function result + ror w7, w7, #11 // Rotate left s=21 bits + movz x6, #0x7e82 // .Load lower half of constant 0xf7537e82 + movk x6, #0xf753, lsl #16 // .Load upper half of constant 0xf7537e82 + add w9, w8, w7 // Add X parameter round 4 B=II(B, C, D, A, 0x4e0811a1, s=21, M[13]) + add w17, w4, w14 // Add dest value + orn x7, x9, x15 // Begin aux function round 4 I(x,y,z)=((~z|x)^y) + add w14, w17, w6 // Add constant 0xf7537e82 + eor x4, x8, x7 // End aux function round 4 I(x,y,z)=((~z|x)^y) + add w17, w14, w4 // Add aux function result + ror w17, w17, #26 // Rotate left s=6 bits + movz x6, #0xf235 // .Load lower half of constant 0xbd3af235 + movk x6, #0xbd3a, lsl #16 // .Load upper half of constant 0xbd3af235 + add w7, w9, w17 // Add X parameter round 4 A=II(A, B, C, D, 0xf7537e82, s=6, M[4]) + orn x14, x7, x8 // Begin aux function round 4 I(x,y,z)=((~z|x)^y) + add w4, w15, w25 // Add dest value + eor x17, x9, x14 // End aux function round 4 I(x,y,z)=((~z|x)^y) + add w15, w4, w6 // Add constant 0xbd3af235 + add w16, w15, w17 // Add aux function result + ror w16, w16, #22 // Rotate left s=10 bits + movz x14, #0xd2bb // .Load lower half of constant 0x2ad7d2bb + movk x14, #0x2ad7, lsl #16 // .Load upper half of constant 0x2ad7d2bb + add w4, w7, w16 // Add X parameter round 4 D=II(D, A, B, C, 0xbd3af235, s=10, M[11]) + add w6, w8, w3 // Add dest value + orn x15, x4, x9 // Begin aux function round 4 I(x,y,z)=((~z|x)^y) + add w17, w6, w14 // Add constant 0x2ad7d2bb + eor x16, x7, x15 // End aux function round 4 I(x,y,z)=((~z|x)^y) + add w8, w17, w16 // Add aux function result + ror w8, w8, #17 // Rotate left s=15 bits + movz x3, #0xd391 // .Load lower half of constant 0xeb86d391 + movk x3, #0xeb86, lsl #16 // .Load upper half of constant 0xeb86d391 + add w14, w4, w8 // Add X parameter round 4 C=II(C, D, A, B, 0x2ad7d2bb, s=15, M[2]) + orn x6, x14, x7 // Begin aux function round 4 I(x,y,z)=((~z|x)^y) + add w15, w9, w24 // Add dest value + eor x17, x4, x6 // End aux function round 4 I(x,y,z)=((~z|x)^y) + add w16, w15, w3 // Add constant 0xeb86d391 + add w8, w16, w17 // Add aux function result + ror w8, w8, #11 // Rotate left s=21 bits + ldp w6, w15, [x0] // Reload MD5 state->A and state->B + ldp w5, w9, [x0, #8] // Reload MD5 state->C and state->D + add w3, w14, w8 // Add X parameter round 4 B=II(B, C, D, A, 0xeb86d391, s=21, M[9]) + add w13, w4, w9 // Add result of MD5 rounds to state->D + add w12, w14, w5 // Add result of MD5 rounds to state->C + add w10, w7, w6 // Add result of MD5 rounds to state->A + add w11, w3, w15 // Add result of MD5 rounds to state->B + stp w12, w13, [x0, #8] // Store MD5 states C,D + stp w10, w11, [x0] // Store MD5 states A,B + add x1, x1, #64 // Increment data pointer + subs w2, w2, #1 // Decrement block counter + b.ne ossl_md5_blocks_loop + + ldp x21,x22,[sp,#16] + ldp x23,x24,[sp,#32] + ldp x25,x26,[sp,#48] + ldp x27,x28,[sp,#64] + ldp x19,x20,[sp],#80 + ret + diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/AARCH64-GCC/crypto/modes/aes-gcm-armv8-unroll8_64.S b/CryptoPkg/Library/OpensslLib/OpensslGen/AARCH64-GCC/crypto/modes/aes-gcm-armv8-unroll8_64.S new file mode 100644 index 000000000000..218bcc13dc1a --- /dev/null +++ b/CryptoPkg/Library/OpensslLib/OpensslGen/AARCH64-GCC/crypto/modes/aes-gcm-armv8-unroll8_64.S @@ -0,0 +1,8487 @@ +#include "arm_arch.h" + +#if __ARM_MAX_ARCH__>=8 +.arch armv8-a+crypto +.text +.globl unroll8_eor3_aes_gcm_enc_128_kernel +.type unroll8_eor3_aes_gcm_enc_128_kernel,%function +.align 4 +unroll8_eor3_aes_gcm_enc_128_kernel: + 
AARCH64_VALID_CALL_TARGET + cbz x1, .L128_enc_ret + stp d8, d9, [sp, #-80]! + lsr x9, x1, #3 + mov x16, x4 + mov x8, x5 + stp d10, d11, [sp, #16] + stp d12, d13, [sp, #32] + stp d14, d15, [sp, #48] + mov x5, #0xc200000000000000 + stp x5, xzr, [sp, #64] + add x10, sp, #64 + + mov x15, #0x100000000 //set up counter increment + movi v31.16b, #0x0 + mov v31.d[1], x15 + mov x5, x9 + ld1 { v0.16b}, [x16] //CTR block 0 + + sub x5, x5, #1 //byte_len - 1 + + and x5, x5, #0xffffffffffffff80 //number of bytes to be processed in main loop (at least 1 byte must be handled by tail) + + rev32 v30.16b, v0.16b //set up reversed counter + + add v30.4s, v30.4s, v31.4s //CTR block 0 + + rev32 v1.16b, v30.16b //CTR block 1 + add v30.4s, v30.4s, v31.4s //CTR block 1 + + rev32 v2.16b, v30.16b //CTR block 2 + add v30.4s, v30.4s, v31.4s //CTR block 2 + + rev32 v3.16b, v30.16b //CTR block 3 + add v30.4s, v30.4s, v31.4s //CTR block 3 + + rev32 v4.16b, v30.16b //CTR block 4 + add v30.4s, v30.4s, v31.4s //CTR block 4 + + rev32 v5.16b, v30.16b //CTR block 5 + add v30.4s, v30.4s, v31.4s //CTR block 5 + ldp q26, q27, [x8, #0] //load rk0, rk1 + + rev32 v6.16b, v30.16b //CTR block 6 + add v30.4s, v30.4s, v31.4s //CTR block 6 + + rev32 v7.16b, v30.16b //CTR block 7 + add v30.4s, v30.4s, v31.4s //CTR block 7 + + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 4 - round 0 + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 6 - round 0 + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 3 - round 0 + + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 0 - round 0 + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 1 - round 0 + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 2 - round 0 + + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 7 - round 0 + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 5 - round 0 + ldp q28, q26, [x8, #32] //load rk2, rk3 + + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 3 - round 1 + + aese v7.16b, v27.16b + aesmc v7.16b, v7.16b //AES block 7 - round 1 + aese v5.16b, v27.16b + aesmc v5.16b, v5.16b //AES block 5 - round 1 + aese v4.16b, v27.16b + aesmc v4.16b, v4.16b //AES block 4 - round 1 + + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 2 - round 1 + aese v6.16b, v27.16b + aesmc v6.16b, v6.16b //AES block 6 - round 1 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 0 - round 1 + + aese v5.16b, v28.16b + aesmc v5.16b, v5.16b //AES block 5 - round 2 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 1 - round 1 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 0 - round 2 + + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 2 - round 2 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 3 - round 2 + aese v7.16b, v28.16b + aesmc v7.16b, v7.16b //AES block 7 - round 2 + + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 1 - round 2 + aese v6.16b, v28.16b + aesmc v6.16b, v6.16b //AES block 6 - round 2 + aese v4.16b, v28.16b + aesmc v4.16b, v4.16b //AES block 4 - round 2 + + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 2 - round 3 + + ldp q27, q28, [x8, #64] //load rk4, rk5 + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 5 - round 3 + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 0 - round 3 + + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 4 - round 3 + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 3 - round 3 + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 6 - round 3 + + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b 
//AES block 7 - round 3 + + aese v6.16b, v27.16b + aesmc v6.16b, v6.16b //AES block 6 - round 4 + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 1 - round 3 + aese v5.16b, v27.16b + aesmc v5.16b, v5.16b //AES block 5 - round 4 + + aese v7.16b, v27.16b + aesmc v7.16b, v7.16b //AES block 7 - round 4 + aese v4.16b, v27.16b + aesmc v4.16b, v4.16b //AES block 4 - round 4 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 0 - round 4 + + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 1 - round 4 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 2 - round 4 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 3 - round 4 + + aese v7.16b, v28.16b + aesmc v7.16b, v7.16b //AES block 7 - round 5 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 0 - round 5 + ldp q26, q27, [x8, #96] //load rk6, rk7 + + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 1 - round 5 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 3 - round 5 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 2 - round 5 + + aese v4.16b, v28.16b + aesmc v4.16b, v4.16b //AES block 4 - round 5 + aese v5.16b, v28.16b + aesmc v5.16b, v5.16b //AES block 5 - round 5 + aese v6.16b, v28.16b + aesmc v6.16b, v6.16b //AES block 6 - round 5 + + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 4 - round 6 + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 3 - round 6 + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 2 - round 6 + + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 7 - round 6 + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 6 - round 6 + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 5 - round 6 + + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 0 - round 6 + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 1 - round 6 + ldp q28, q26, [x8, #128] //load rk8, rk9 + + aese v5.16b, v27.16b + aesmc v5.16b, v5.16b //AES block 5 - round 7 + + ld1 { v19.16b}, [x3] + ext v19.16b, v19.16b, v19.16b, #8 + rev64 v19.16b, v19.16b + + aese v7.16b, v27.16b + aesmc v7.16b, v7.16b //AES block 7 - round 7 + + aese v4.16b, v27.16b + aesmc v4.16b, v4.16b //AES block 4 - round 7 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 3 - round 7 + aese v6.16b, v27.16b + aesmc v6.16b, v6.16b //AES block 6 - round 7 + + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 1 - round 7 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 2 - round 7 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 0 - round 7 + + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 8 + aese v6.16b, v28.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 8 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 8 + + aese v7.16b, v28.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 8 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 8 + ldr q27, [x8, #160] //load rk10 + + aese v3.16b, v26.16b //AES block 8k+11 - round 9 + aese v4.16b, v28.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 8 + aese v2.16b, v26.16b //AES block 8k+10 - round 9 + + aese v5.16b, v28.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 8 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 8 + aese v6.16b, v26.16b //AES block 8k+14 - round 9 + + aese v4.16b, v26.16b //AES block 8k+12 - round 9 + add x5, x5, x0 + aese v0.16b, v26.16b //AES block 8k+8 - round 9 + + aese v7.16b, v26.16b //AES block 8k+15 - round 9 + aese v5.16b, v26.16b //AES block 8k+13 - round 9 + aese v1.16b, 
v26.16b //AES block 8k+9 - round 9 + + add x4, x0, x1, lsr #3 //end_input_ptr + cmp x0, x5 //check if we have <= 8 blocks + b.ge .L128_enc_tail //handle tail + + ldp q8, q9, [x0], #32 //AES block 0, 1 - load plaintext + + ldp q10, q11, [x0], #32 //AES block 2, 3 - load plaintext + + ldp q12, q13, [x0], #32 //AES block 4, 5 - load plaintext + + ldp q14, q15, [x0], #32 //AES block 6, 7 - load plaintext + cmp x0, x5 //check if we have <= 8 blocks + +.inst 0xce006d08 //eor3 v8.16b, v8.16b, v0.16b, v27.16b //AES block 0 - result + rev32 v0.16b, v30.16b //CTR block 8 + add v30.4s, v30.4s, v31.4s //CTR block 8 + +.inst 0xce016d29 //eor3 v9.16b, v9.16b, v1.16b, v27.16b //AES block 1 - result + stp q8, q9, [x2], #32 //AES block 0, 1 - store result + + rev32 v1.16b, v30.16b //CTR block 9 +.inst 0xce056dad //eor3 v13.16b, v13.16b, v5.16b, v27.16b //AES block 5 - result + add v30.4s, v30.4s, v31.4s //CTR block 9 + +.inst 0xce026d4a //eor3 v10.16b, v10.16b, v2.16b, v27.16b //AES block 2 - result +.inst 0xce066dce //eor3 v14.16b, v14.16b, v6.16b, v27.16b //AES block 6 - result +.inst 0xce046d8c //eor3 v12.16b, v12.16b, v4.16b, v27.16b //AES block 4 - result + + rev32 v2.16b, v30.16b //CTR block 10 + add v30.4s, v30.4s, v31.4s //CTR block 10 + +.inst 0xce036d6b //eor3 v11.16b, v11.16b, v3.16b, v27.16b //AES block 3 - result +.inst 0xce076def //eor3 v15.16b, v15.16b, v7.16b,v27.16b //AES block 7 - result + stp q10, q11, [x2], #32 //AES block 2, 3 - store result + + rev32 v3.16b, v30.16b //CTR block 11 + add v30.4s, v30.4s, v31.4s //CTR block 11 + stp q12, q13, [x2], #32 //AES block 4, 5 - store result + + stp q14, q15, [x2], #32 //AES block 6, 7 - store result + + rev32 v4.16b, v30.16b //CTR block 12 + add v30.4s, v30.4s, v31.4s //CTR block 12 + b.ge .L128_enc_prepretail //do prepretail + +.L128_enc_main_loop: //main loop start + rev32 v5.16b, v30.16b //CTR block 8k+13 + ldr q20, [x3, #128] //load h5l | h5h + ext v20.16b, v20.16b, v20.16b, #8 + ldr q22, [x3, #160] //load h6l | h6h + ext v22.16b, v22.16b, v22.16b, #8 + add v30.4s, v30.4s, v31.4s //CTR block 8k+13 + + rev64 v9.16b, v9.16b //GHASH block 8k+1 + rev64 v8.16b, v8.16b //GHASH block 8k + ldr q23, [x3, #176] //load h7l | h7h + ext v23.16b, v23.16b, v23.16b, #8 + ldr q25, [x3, #208] //load h8l | h8h + ext v25.16b, v25.16b, v25.16b, #8 + + rev32 v6.16b, v30.16b //CTR block 8k+14 + add v30.4s, v30.4s, v31.4s //CTR block 8k+14 + ext v19.16b, v19.16b, v19.16b, #8 //PRE 0 + + ldr q21, [x3, #144] //load h6k | h5k + ldr q24, [x3, #192] //load h8k | h7k + rev64 v13.16b, v13.16b //GHASH block 8k+5 (t0, t1, t2 and t3 free) + rev64 v11.16b, v11.16b //GHASH block 8k+3 + + ldp q26, q27, [x8, #0] //load rk0, rk1 + eor v8.16b, v8.16b, v19.16b //PRE 1 + rev32 v7.16b, v30.16b //CTR block 8k+15 + + rev64 v15.16b, v15.16b //GHASH block 8k+7 (t0, t1, t2 and t3 free) + + pmull2 v16.1q, v9.2d, v23.2d //GHASH block 8k+1 - high + rev64 v10.16b, v10.16b //GHASH block 8k+2 + pmull2 v17.1q, v8.2d, v25.2d //GHASH block 8k - high + + pmull v23.1q, v9.1d, v23.1d //GHASH block 8k+1 - low + trn1 v18.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid + pmull v19.1q, v8.1d, v25.1d //GHASH block 8k - low + + trn2 v8.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid + pmull2 v29.1q, v10.2d, v22.2d //GHASH block 8k+2 - high + pmull2 v9.1q, v11.2d, v20.2d //GHASH block 8k+3 - high + + eor v19.16b, v19.16b, v23.16b //GHASH block 8k+1 - low + ldr q23, [x3, #80] //load h3l | h3h + ext v23.16b, v23.16b, v23.16b, #8 + ldr q25, [x3, #112] //load h3l | h3h + ext v25.16b, v25.16b, v25.16b, #8 + aese 
v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 0 + + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 0 + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 0 + eor v17.16b, v17.16b, v16.16b //GHASH block 8k+1 - high + + add v30.4s, v30.4s, v31.4s //CTR block 8k+15 + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 0 + eor v8.16b, v8.16b, v18.16b //GHASH block 8k, 8k+1 - mid + + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 0 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 1 + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 0 + + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 1 + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 0 + pmull v22.1q, v10.1d, v22.1d //GHASH block 8k+2 - low + + aese v5.16b, v27.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 1 + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 0 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 1 + +.inst 0xce1d2631 //eor3 v17.16b, v17.16b, v29.16b,v9.16b //GHASH block 8k+2, 8k+3 - high + trn1 v29.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid + trn2 v10.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid + + ldp q28, q26, [x8, #32] //load rk2, rk3 + aese v4.16b, v27.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 1 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 1 + + pmull v20.1q, v11.1d, v20.1d //GHASH block 8k+3 - low + aese v7.16b, v27.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 1 + aese v6.16b, v27.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 1 + + pmull2 v18.1q, v8.2d, v24.2d //GHASH block 8k - mid + eor v10.16b, v10.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid + pmull v24.1q, v8.1d, v24.1d //GHASH block 8k+1 - mid + + rev64 v14.16b, v14.16b //GHASH block 8k+6 (t0, t1, and t2 free) +.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+2, 8k+3 - low + + pmull2 v29.1q, v10.2d, v21.2d //GHASH block 8k+2 - mid + eor v18.16b, v18.16b, v24.16b //GHASH block 8k+1 - mid + pmull v21.1q, v10.1d, v21.1d //GHASH block 8k+3 - mid + + aese v5.16b, v28.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 2 + aese v4.16b, v28.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 2 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 2 + + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 2 +.inst 0xce157652 //eor3 v18.16b, v18.16b, v21.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid + aese v6.16b, v28.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 2 + + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 2 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 2 + aese v7.16b, v28.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 2 + + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 3 + ldr q21, [x3, #48] //load h2k | h1k + ldr q24, [x3, #96] //load h4k | h3k + rev64 v12.16b, v12.16b //GHASH block 8k+4 (t0, t1, and t2 free) + + ldp q27, q28, [x8, #64] //load rk4, rk5 + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 3 + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 3 + + ldr q20, [x3, #32] //load h1l | h1h + ext v20.16b, v20.16b, v20.16b, #8 + ldr q22, [x3, #64] //load h1l | h1h + ext v22.16b, v22.16b, v22.16b, #8 + pmull2 v8.1q, v12.2d, v25.2d //GHASH block 8k+4 - high + pmull v25.1q, v12.1d, 
v25.1d //GHASH block 8k+4 - low + + trn1 v16.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid + trn2 v12.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid + + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 3 + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 3 + + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 3 + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 3 + + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 3 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 4 + + aese v7.16b, v27.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 4 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 4 + aese v4.16b, v27.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 4 + + aese v5.16b, v27.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 4 + aese v6.16b, v27.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 4 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 4 + + pmull2 v10.1q, v13.2d, v23.2d //GHASH block 8k+5 - high + eor v12.16b, v12.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid + pmull v23.1q, v13.1d, v23.1d //GHASH block 8k+5 - low + + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 4 + ldp q26, q27, [x8, #96] //load rk6, rk7 + trn1 v13.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid + + pmull2 v16.1q, v12.2d, v24.2d //GHASH block 8k+4 - mid + pmull v24.1q, v12.1d, v24.1d //GHASH block 8k+5 - mid + pmull2 v11.1q, v14.2d, v22.2d //GHASH block 8k+6 - high + + pmull2 v12.1q, v15.2d, v20.2d //GHASH block 8k+7 - high + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 5 + aese v5.16b, v28.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 5 + + pmull v22.1q, v14.1d, v22.1d //GHASH block 8k+6 - low +.inst 0xce082a31 //eor3 v17.16b, v17.16b, v8.16b, v10.16b //GHASH block 8k+4, 8k+5 - high + trn2 v14.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid + +.inst 0xce195e73 //eor3 v19.16b, v19.16b, v25.16b, v23.16b //GHASH block 8k+4, 8k+5 - low + aese v6.16b, v28.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 5 + + eor v14.16b, v14.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid + aese v7.16b, v28.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 5 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 5 + + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 5 + aese v4.16b, v28.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 5 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 5 + +.inst 0xce184252 //eor3 v18.16b, v18.16b, v24.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid + ldr d16, [x10] //MODULO - load modulo constant + pmull v20.1q, v15.1d, v20.1d //GHASH block 8k+7 - low + + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 6 + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 6 + + pmull2 v13.1q, v14.2d, v21.2d //GHASH block 8k+6 - mid + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 6 + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 6 + + pmull v21.1q, v14.1d, v21.1d //GHASH block 8k+7 - mid +.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+6, 8k+7 - low + ldp q8, q9, [x0], #32 //AES block 8k+8, 8k+9 - load plaintext + + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 6 + rev32 v20.16b, v30.16b //CTR block 8k+16 + add v30.4s, v30.4s, v31.4s //CTR block 8k+16 + + aese v4.16b, 
v26.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 6 + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 6 + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 6 + +.inst 0xce153652 //eor3 v18.16b, v18.16b, v21.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid + ldp q28, q26, [x8, #128] //load rk8, rk9 +.inst 0xce0b3231 //eor3 v17.16b, v17.16b, v11.16b, v12.16b //GHASH block 8k+6, 8k+7 - high + + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 7 + aese v7.16b, v27.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 7 + ldp q10, q11, [x0], #32 //AES block 8k+10, 8k+11 - load plaintext + + aese v5.16b, v27.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 7 + aese v6.16b, v27.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 7 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 7 + + pmull v21.1q, v17.1d, v16.1d //MODULO - top 64b align with mid + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 7 + aese v4.16b, v27.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 7 + + rev32 v22.16b, v30.16b //CTR block 8k+17 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 7 + + aese v5.16b, v28.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 8 + ldp q12, q13, [x0], #32 //AES block 8k+12, 8k+13 - load plaintext + add v30.4s, v30.4s, v31.4s //CTR block 8k+17 + + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 8 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 8 + aese v7.16b, v28.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 8 + + aese v4.16b, v28.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 8 +.inst 0xce114e52 //eor3 v18.16b, v18.16b, v17.16b, v19.16b //MODULO - karatsuba tidy up + ldr q27, [x8, #160] //load rk10 + + ext v29.16b, v17.16b, v17.16b, #8 //MODULO - other top alignment + rev32 v23.16b, v30.16b //CTR block 8k+18 + add v30.4s, v30.4s, v31.4s //CTR block 8k+18 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 8 + + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 8 +.inst 0xce1d5652 //eor3 v18.16b, v18.16b, v29.16b, v21.16b //MODULO - fold into mid + aese v6.16b, v28.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 8 + + aese v2.16b, v26.16b //AES block 8k+10 - round 9 + aese v4.16b, v26.16b //AES block 8k+12 - round 9 + aese v1.16b, v26.16b //AES block 8k+9 - round 9 + + ldp q14, q15, [x0], #32 //AES block 8k+14, 8k+15 - load plaintext + rev32 v25.16b, v30.16b //CTR block 8k+19 + add v30.4s, v30.4s, v31.4s //CTR block 8k+19 + + cmp x0, x5 //.LOOP CONTROL +.inst 0xce046d8c //eor3 v12.16b, v12.16b, v4.16b, v27.16b //AES block 4 - result + aese v7.16b, v26.16b //AES block 8k+15 - round 9 + + aese v6.16b, v26.16b //AES block 8k+14 - round 9 + aese v3.16b, v26.16b //AES block 8k+11 - round 9 + +.inst 0xce026d4a //eor3 v10.16b, v10.16b, v2.16b, v27.16b //AES block 8k+10 - result + + mov v2.16b, v23.16b //CTR block 8k+18 + aese v0.16b, v26.16b //AES block 8k+8 - round 9 + + rev32 v4.16b, v30.16b //CTR block 8k+20 + add v30.4s, v30.4s, v31.4s //CTR block 8k+20 + +.inst 0xce076def //eor3 v15.16b, v15.16b, v7.16b, v27.16b //AES block 7 - result + aese v5.16b, v26.16b //AES block 8k+13 - round 9 + pmull v17.1q, v18.1d, v16.1d //MODULO - mid 64b align with low + +.inst 0xce016d29 //eor3 v9.16b, v9.16b, v1.16b, v27.16b //AES block 8k+9 - result +.inst 0xce036d6b //eor3 v11.16b, v11.16b, v3.16b, v27.16b //AES block 8k+11 - result + mov v3.16b, v25.16b //CTR block 8k+19 + 
+ ext v21.16b, v18.16b, v18.16b, #8 //MODULO - other mid alignment +.inst 0xce056dad //eor3 v13.16b, v13.16b, v5.16b, v27.16b //AES block 5 - result + mov v1.16b, v22.16b //CTR block 8k+17 + +.inst 0xce006d08 //eor3 v8.16b, v8.16b, v0.16b, v27.16b //AES block 8k+8 - result + mov v0.16b, v20.16b //CTR block 8k+16 + stp q8, q9, [x2], #32 //AES block 8k+8, 8k+9 - store result + + stp q10, q11, [x2], #32 //AES block 8k+10, 8k+11 - store result +.inst 0xce066dce //eor3 v14.16b, v14.16b, v6.16b, v27.16b //AES block 6 - result + + stp q12, q13, [x2], #32 //AES block 8k+12, 8k+13 - store result +.inst 0xce115673 //eor3 v19.16b, v19.16b, v17.16b, v21.16b //MODULO - fold into low + + stp q14, q15, [x2], #32 //AES block 8k+14, 8k+15 - store result + b.lt .L128_enc_main_loop + +.L128_enc_prepretail: //PREPRETAIL + rev32 v5.16b, v30.16b //CTR block 8k+13 + ldr q23, [x3, #176] //load h7l | h7h + ext v23.16b, v23.16b, v23.16b, #8 + ldr q25, [x3, #208] //load h8l | h8h + ext v25.16b, v25.16b, v25.16b, #8 + ext v19.16b, v19.16b, v19.16b, #8 //PRE 0 + + ldr q20, [x3, #128] //load h5l | h5h + ext v20.16b, v20.16b, v20.16b, #8 + ldr q22, [x3, #160] //load h6l | h6h + ext v22.16b, v22.16b, v22.16b, #8 + rev64 v8.16b, v8.16b //GHASH block 8k + rev64 v9.16b, v9.16b //GHASH block 8k+1 + + ldr q21, [x3, #144] //load h6k | h5k + ldr q24, [x3, #192] //load h6k | h5k + add v30.4s, v30.4s, v31.4s //CTR block 8k+13 + rev64 v11.16b, v11.16b //GHASH block 8k+3 + + rev64 v10.16b, v10.16b //GHASH block 8k+2 + eor v8.16b, v8.16b, v19.16b //PRE 1 + + rev32 v6.16b, v30.16b //CTR block 8k+14 + + pmull2 v16.1q, v9.2d, v23.2d //GHASH block 8k+1 - high + pmull v19.1q, v8.1d, v25.1d //GHASH block 8k - low + pmull2 v17.1q, v8.2d, v25.2d //GHASH block 8k - high + + rev64 v13.16b, v13.16b //GHASH block 8k+5 (t0, t1, t2 and t3 free) + trn1 v18.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid + + pmull v23.1q, v9.1d, v23.1d //GHASH block 8k+1 - low + eor v17.16b, v17.16b, v16.16b //GHASH block 8k+1 - high + trn2 v8.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid + + eor v19.16b, v19.16b, v23.16b //GHASH block 8k+1 - low + eor v8.16b, v8.16b, v18.16b //GHASH block 8k, 8k+1 - mid + + ldp q26, q27, [x8, #0] //load rk0, rk1 + add v30.4s, v30.4s, v31.4s //CTR block 8k+14 + + pmull2 v18.1q, v8.2d, v24.2d //GHASH block 8k - mid + pmull v24.1q, v8.1d, v24.1d //GHASH block 8k+1 - mid + + rev64 v12.16b, v12.16b //GHASH block 8k+4 (t0, t1, and t2 free) + rev64 v15.16b, v15.16b //GHASH block 8k+7 (t0, t1, t2 and t3 free) + + eor v18.16b, v18.16b, v24.16b //GHASH block 8k+1 - mid + + rev32 v7.16b, v30.16b //CTR block 8k+15 + + rev64 v14.16b, v14.16b //GHASH block 8k+6 (t0, t1, and t2 free) + + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 0 + + pmull2 v9.1q, v11.2d, v20.2d //GHASH block 8k+3 - high + pmull2 v29.1q, v10.2d, v22.2d //GHASH block 8k+2 - high + + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 0 + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 0 + + pmull v22.1q, v10.1d, v22.1d //GHASH block 8k+2 - low + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 0 + +.inst 0xce1d2631 //eor3 v17.16b, v17.16b, v29.16b, v9.16b //GHASH block 8k+2, 8k+3 - high + trn1 v29.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid + trn2 v10.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid + + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 0 + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 0 + + eor v10.16b, v10.16b, v29.16b //GHASH 
block 8k+2, 8k+3 - mid + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 0 + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 0 + + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 1 + pmull v20.1q, v11.1d, v20.1d //GHASH block 8k+3 - low + + ldr q23, [x3, #80] //load h3l | h3h + ext v23.16b, v23.16b, v23.16b, #8 + ldr q25, [x3, #112] //load h4l | h4h + ext v25.16b, v25.16b, v25.16b, #8 + + ldp q28, q26, [x8, #32] //load rk2, rk3 + aese v5.16b, v27.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 1 + pmull2 v29.1q, v10.2d, v21.2d //GHASH block 8k+2 - mid + +.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+2, 8k+3 - low + pmull v21.1q, v10.1d, v21.1d //GHASH block 8k+3 - mid + + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 1 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 1 + +.inst 0xce157652 //eor3 v18.16b, v18.16b, v21.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid + ldr q21, [x3, #48] //load h2k | h1k + ldr q24, [x3, #96] //load h4k | h3k + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 1 + + aese v4.16b, v27.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 1 + aese v7.16b, v27.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 1 + + aese v5.16b, v28.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 2 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 2 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 2 + + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 2 + aese v6.16b, v27.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 1 + aese v4.16b, v28.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 2 + + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 3 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 2 + + aese v6.16b, v28.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 2 + aese v7.16b, v28.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 2 + ldp q27, q28, [x8, #64] //load rk4, rk5 + + ldr q20, [x3, #32] //load h1l | h1h + ext v20.16b, v20.16b, v20.16b, #8 + ldr q22, [x3, #64] //load h1l | h1h + ext v22.16b, v22.16b, v22.16b, #8 + trn1 v16.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 3 + + pmull2 v8.1q, v12.2d, v25.2d //GHASH block 8k+4 - high + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 3 + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 3 + + pmull v25.1q, v12.1d, v25.1d //GHASH block 8k+4 - low + trn2 v12.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid + pmull2 v10.1q, v13.2d, v23.2d //GHASH block 8k+5 - high + + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 3 + add v30.4s, v30.4s, v31.4s //CTR block 8k+15 + + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 3 + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 3 + eor v12.16b, v12.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid + + pmull v23.1q, v13.1d, v23.1d //GHASH block 8k+5 - low + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 3 + pmull2 v11.1q, v14.2d, v22.2d //GHASH block 8k+6 - high + + trn1 v13.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid + pmull v22.1q, v14.1d, v22.1d //GHASH block 8k+6 - low + trn2 v14.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid + + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 4 
+ aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 4 +.inst 0xce082a31 //eor3 v17.16b, v17.16b, v8.16b, v10.16b //GHASH block 8k+4, 8k+5 - high + +.inst 0xce195e73 //eor3 v19.16b, v19.16b, v25.16b, v23.16b //GHASH block 8k+4, 8k+5 - low + eor v14.16b, v14.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid + pmull2 v16.1q, v12.2d, v24.2d //GHASH block 8k+4 - mid + + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 5 + aese v6.16b, v27.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 4 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 4 + + aese v7.16b, v27.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 4 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 4 + + pmull v24.1q, v12.1d, v24.1d //GHASH block 8k+5 - mid + aese v4.16b, v27.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 4 + aese v5.16b, v27.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 4 + + pmull2 v12.1q, v15.2d, v20.2d //GHASH block 8k+7 - high + ldp q26, q27, [x8, #96] //load rk6, rk7 + pmull v20.1q, v15.1d, v20.1d //GHASH block 8k+7 - low + +.inst 0xce184252 //eor3 v18.16b, v18.16b, v24.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid + pmull2 v13.1q, v14.2d, v21.2d //GHASH block 8k+6 - mid + pmull v21.1q, v14.1d, v21.1d //GHASH block 8k+7 - mid + + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 5 + aese v7.16b, v28.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 5 + ldr d16, [x10] //MODULO - load modulo constant + + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 5 + aese v4.16b, v28.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 5 + +.inst 0xce0b3231 //eor3 v17.16b, v17.16b, v11.16b, v12.16b //GHASH block 8k+6, 8k+7 - high + aese v5.16b, v28.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 5 + aese v6.16b, v28.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 5 + + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 5 + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 6 + + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 6 + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 6 + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 6 + + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 6 +.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+6, 8k+7 - low +.inst 0xce153652 //eor3 v18.16b, v18.16b, v21.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid + + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 6 + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 6 + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 6 + + pmull v21.1q, v17.1d, v16.1d //MODULO - top 64b align with mid +.inst 0xce114e52 //eor3 v18.16b, v18.16b, v17.16b, v19.16b //MODULO - karatsuba tidy up + ldp q28, q26, [x8, #128] //load rk8, rk9 + + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 7 + aese v6.16b, v27.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 7 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 7 + ext v29.16b, v17.16b, v17.16b, #8 //MODULO - other top alignment + + aese v5.16b, v27.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 7 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 7 +.inst 0xce1d5652 //eor3 v18.16b, v18.16b, v29.16b, v21.16b //MODULO - fold into mid + + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - 
round 7 + aese v7.16b, v27.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 7 + + pmull v17.1q, v18.1d, v16.1d //MODULO - mid 64b align with low + aese v4.16b, v27.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 7 + + aese v7.16b, v28.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 8 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 8 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 8 + ext v18.16b, v18.16b, v18.16b, #8 //MODULO - other mid alignment + + aese v6.16b, v28.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 8 +.inst 0xce114a73 //eor3 v19.16b, v19.16b, v17.16b, v18.16b //MODULO - fold into low + aese v4.16b, v28.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 8 + + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 8 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 8 + aese v5.16b, v28.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 8 + + ldr q27, [x8, #160] //load rk10 + aese v6.16b, v26.16b //AES block 8k+14 - round 9 + aese v2.16b, v26.16b //AES block 8k+10 - round 9 + + aese v0.16b, v26.16b //AES block 8k+8 - round 9 + aese v1.16b, v26.16b //AES block 8k+9 - round 9 + + aese v3.16b, v26.16b //AES block 8k+11 - round 9 + aese v5.16b, v26.16b //AES block 8k+13 - round 9 + + aese v4.16b, v26.16b //AES block 8k+12 - round 9 + aese v7.16b, v26.16b //AES block 8k+15 - round 9 +.L128_enc_tail: //TAIL + + sub x5, x4, x0 //main_end_input_ptr is number of bytes left to process + ldr q8, [x0], #16 //AES block 8k+8 - load plaintext + + mov v29.16b, v27.16b + ldp q20, q21, [x3, #128] //load h5l | h5h + ext v20.16b, v20.16b, v20.16b, #8 + +.inst 0xce007509 //eor3 v9.16b, v8.16b, v0.16b, v29.16b //AES block 8k+8 - result + ext v16.16b, v19.16b, v19.16b, #8 //prepare final partial tag + ldp q22, q23, [x3, #160] //load h6l | h6h + ext v22.16b, v22.16b, v22.16b, #8 + ext v23.16b, v23.16b, v23.16b, #8 + + ldp q24, q25, [x3, #192] //load h8k | h7k + ext v25.16b, v25.16b, v25.16b, #8 + cmp x5, #112 + b.gt .L128_enc_blocks_more_than_7 + + mov v7.16b, v6.16b + mov v6.16b, v5.16b + movi v17.8b, #0 + + cmp x5, #96 + sub v30.4s, v30.4s, v31.4s + mov v5.16b, v4.16b + + mov v4.16b, v3.16b + mov v3.16b, v2.16b + mov v2.16b, v1.16b + + movi v19.8b, #0 + movi v18.8b, #0 + b.gt .L128_enc_blocks_more_than_6 + + mov v7.16b, v6.16b + cmp x5, #80 + + sub v30.4s, v30.4s, v31.4s + mov v6.16b, v5.16b + mov v5.16b, v4.16b + + mov v4.16b, v3.16b + mov v3.16b, v1.16b + b.gt .L128_enc_blocks_more_than_5 + + cmp x5, #64 + sub v30.4s, v30.4s, v31.4s + + mov v7.16b, v6.16b + mov v6.16b, v5.16b + + mov v5.16b, v4.16b + mov v4.16b, v1.16b + b.gt .L128_enc_blocks_more_than_4 + + mov v7.16b, v6.16b + sub v30.4s, v30.4s, v31.4s + mov v6.16b, v5.16b + + mov v5.16b, v1.16b + cmp x5, #48 + b.gt .L128_enc_blocks_more_than_3 + + sub v30.4s, v30.4s, v31.4s + mov v7.16b, v6.16b + mov v6.16b, v1.16b + + cmp x5, #32 + ldr q24, [x3, #96] //load h4k | h3k + b.gt .L128_enc_blocks_more_than_2 + + cmp x5, #16 + + sub v30.4s, v30.4s, v31.4s + mov v7.16b, v1.16b + b.gt .L128_enc_blocks_more_than_1 + + ldr q21, [x3, #48] //load h2k | h1k + sub v30.4s, v30.4s, v31.4s + b .L128_enc_blocks_less_than_1 +.L128_enc_blocks_more_than_7: //blocks left > 7 + st1 { v9.16b}, [x2], #16 //AES final-7 block - store result + + rev64 v8.16b, v9.16b //GHASH final-7 block + ldr q9, [x0], #16 //AES final-6 block - load plaintext + + eor v8.16b, v8.16b, v16.16b //feed in partial tag + + ins v27.d[0], v8.d[1] //GHASH final-7 block - mid + + pmull2 
v17.1q, v8.2d, v25.2d //GHASH final-7 block - high + + ins v18.d[0], v24.d[1] //GHASH final-7 block - mid + + eor v27.8b, v27.8b, v8.8b //GHASH final-7 block - mid + movi v16.8b, #0 //suppress further partial tag feed in + +.inst 0xce017529 //eor3 v9.16b, v9.16b, v1.16b, v29.16b //AES final-6 block - result + + pmull v18.1q, v27.1d, v18.1d //GHASH final-7 block - mid + pmull v19.1q, v8.1d, v25.1d //GHASH final-7 block - low +.L128_enc_blocks_more_than_6: //blocks left > 6 + + st1 { v9.16b}, [x2], #16 //AES final-6 block - store result + + rev64 v8.16b, v9.16b //GHASH final-6 block + ldr q9, [x0], #16 //AES final-5 block - load plaintext + + eor v8.16b, v8.16b, v16.16b //feed in partial tag + + ins v27.d[0], v8.d[1] //GHASH final-6 block - mid + +.inst 0xce027529 //eor3 v9.16b, v9.16b, v2.16b, v29.16b //AES final-5 block - result + pmull v26.1q, v8.1d, v23.1d //GHASH final-6 block - low + + eor v27.8b, v27.8b, v8.8b //GHASH final-6 block - mid + movi v16.8b, #0 //suppress further partial tag feed in + + pmull v27.1q, v27.1d, v24.1d //GHASH final-6 block - mid + pmull2 v28.1q, v8.2d, v23.2d //GHASH final-6 block - high + + eor v19.16b, v19.16b, v26.16b //GHASH final-6 block - low + + eor v18.16b, v18.16b, v27.16b //GHASH final-6 block - mid + eor v17.16b, v17.16b, v28.16b //GHASH final-6 block - high +.L128_enc_blocks_more_than_5: //blocks left > 5 + + st1 { v9.16b}, [x2], #16 //AES final-5 block - store result + + rev64 v8.16b, v9.16b //GHASH final-5 block + + eor v8.16b, v8.16b, v16.16b //feed in partial tag + + ins v27.d[0], v8.d[1] //GHASH final-5 block - mid + ldr q9, [x0], #16 //AES final-4 block - load plaintext + pmull2 v28.1q, v8.2d, v22.2d //GHASH final-5 block - high + + eor v17.16b, v17.16b, v28.16b //GHASH final-5 block - high + + eor v27.8b, v27.8b, v8.8b //GHASH final-5 block - mid + + ins v27.d[1], v27.d[0] //GHASH final-5 block - mid + +.inst 0xce037529 //eor3 v9.16b, v9.16b, v3.16b, v29.16b //AES final-4 block - result + pmull v26.1q, v8.1d, v22.1d //GHASH final-5 block - low + movi v16.8b, #0 //suppress further partial tag feed in + + pmull2 v27.1q, v27.2d, v21.2d //GHASH final-5 block - mid + eor v19.16b, v19.16b, v26.16b //GHASH final-5 block - low + + eor v18.16b, v18.16b, v27.16b //GHASH final-5 block - mid +.L128_enc_blocks_more_than_4: //blocks left > 4 + + st1 { v9.16b}, [x2], #16 //AES final-4 block - store result + + rev64 v8.16b, v9.16b //GHASH final-4 block + + ldr q9, [x0], #16 //AES final-3 block - load plaintext + + eor v8.16b, v8.16b, v16.16b //feed in partial tag + + ins v27.d[0], v8.d[1] //GHASH final-4 block - mid + movi v16.8b, #0 //suppress further partial tag feed in + pmull2 v28.1q, v8.2d, v20.2d //GHASH final-4 block - high + + eor v27.8b, v27.8b, v8.8b //GHASH final-4 block - mid + + pmull v26.1q, v8.1d, v20.1d //GHASH final-4 block - low + + eor v17.16b, v17.16b, v28.16b //GHASH final-4 block - high + pmull v27.1q, v27.1d, v21.1d //GHASH final-4 block - mid + + eor v19.16b, v19.16b, v26.16b //GHASH final-4 block - low + +.inst 0xce047529 //eor3 v9.16b, v9.16b, v4.16b, v29.16b //AES final-3 block - result + eor v18.16b, v18.16b, v27.16b //GHASH final-4 block - mid +.L128_enc_blocks_more_than_3: //blocks left > 3 + + st1 { v9.16b}, [x2], #16 //AES final-3 block - store result + + ldr q25, [x3, #112] //load h4l | h4h + ext v25.16b, v25.16b, v25.16b, #8 + + rev64 v8.16b, v9.16b //GHASH final-3 block + + eor v8.16b, v8.16b, v16.16b //feed in partial tag + movi v16.8b, #0 //suppress further partial tag feed in + + ins v27.d[0], v8.d[1] //GHASH final-3 
block - mid + ldr q24, [x3, #96] //load h4k | h3k + pmull v26.1q, v8.1d, v25.1d //GHASH final-3 block - low + + ldr q9, [x0], #16 //AES final-2 block - load plaintext + + eor v27.8b, v27.8b, v8.8b //GHASH final-3 block - mid + + ins v27.d[1], v27.d[0] //GHASH final-3 block - mid + eor v19.16b, v19.16b, v26.16b //GHASH final-3 block - low + +.inst 0xce057529 //eor3 v9.16b, v9.16b, v5.16b, v29.16b //AES final-2 block - result + + pmull2 v27.1q, v27.2d, v24.2d //GHASH final-3 block - mid + pmull2 v28.1q, v8.2d, v25.2d //GHASH final-3 block - high + + eor v18.16b, v18.16b, v27.16b //GHASH final-3 block - mid + eor v17.16b, v17.16b, v28.16b //GHASH final-3 block - high +.L128_enc_blocks_more_than_2: //blocks left > 2 + + st1 { v9.16b}, [x2], #16 //AES final-2 block - store result + + rev64 v8.16b, v9.16b //GHASH final-2 block + + eor v8.16b, v8.16b, v16.16b //feed in partial tag + + ldr q9, [x0], #16 //AES final-1 block - load plaintext + + ins v27.d[0], v8.d[1] //GHASH final-2 block - mid + ldr q23, [x3, #80] //load h3l | h3h + ext v23.16b, v23.16b, v23.16b, #8 + movi v16.8b, #0 //suppress further partial tag feed in + + eor v27.8b, v27.8b, v8.8b //GHASH final-2 block - mid +.inst 0xce067529 //eor3 v9.16b, v9.16b, v6.16b, v29.16b //AES final-1 block - result + + pmull2 v28.1q, v8.2d, v23.2d //GHASH final-2 block - high + + pmull v26.1q, v8.1d, v23.1d //GHASH final-2 block - low + pmull v27.1q, v27.1d, v24.1d //GHASH final-2 block - mid + + eor v17.16b, v17.16b, v28.16b //GHASH final-2 block - high + + eor v18.16b, v18.16b, v27.16b //GHASH final-2 block - mid + eor v19.16b, v19.16b, v26.16b //GHASH final-2 block - low +.L128_enc_blocks_more_than_1: //blocks left > 1 + + st1 { v9.16b}, [x2], #16 //AES final-1 block - store result + + ldr q22, [x3, #64] //load h2l | h2h + ext v22.16b, v22.16b, v22.16b, #8 + rev64 v8.16b, v9.16b //GHASH final-1 block + ldr q9, [x0], #16 //AES final block - load plaintext + + eor v8.16b, v8.16b, v16.16b //feed in partial tag + + movi v16.8b, #0 //suppress further partial tag feed in + ins v27.d[0], v8.d[1] //GHASH final-1 block - mid +.inst 0xce077529 //eor3 v9.16b, v9.16b, v7.16b, v29.16b //AES final block - result + + pmull2 v28.1q, v8.2d, v22.2d //GHASH final-1 block - high + + eor v27.8b, v27.8b, v8.8b //GHASH final-1 block - mid + + ldr q21, [x3, #48] //load h2k | h1k + + ins v27.d[1], v27.d[0] //GHASH final-1 block - mid + + pmull v26.1q, v8.1d, v22.1d //GHASH final-1 block - low + pmull2 v27.1q, v27.2d, v21.2d //GHASH final-1 block - mid + + eor v17.16b, v17.16b, v28.16b //GHASH final-1 block - high + + eor v18.16b, v18.16b, v27.16b //GHASH final-1 block - mid + eor v19.16b, v19.16b, v26.16b //GHASH final-1 block - low +.L128_enc_blocks_less_than_1: //blocks left <= 1 + + rev32 v30.16b, v30.16b + str q30, [x16] //store the updated counter + and x1, x1, #127 //bit_length %= 128 + + sub x1, x1, #128 //bit_length -= 128 + + neg x1, x1 //bit_length = 128 - #bits in input (in range [1,128]) + + mvn x6, xzr //temp0_x = 0xffffffffffffffff + ld1 { v26.16b}, [x2] //load existing bytes where the possibly partial last block is to be stored + and x1, x1, #127 //bit_length %= 128 + + lsr x6, x6, x1 //temp0_x is mask for top 64b of last block + mvn x7, xzr //temp1_x = 0xffffffffffffffff + cmp x1, #64 + + csel x13, x7, x6, lt + csel x14, x6, xzr, lt + + mov v0.d[1], x14 + mov v0.d[0], x13 //ctr0b is mask for last block + + and v9.16b, v9.16b, v0.16b //possibly partial last block has zeroes in highest bits + + rev64 v8.16b, v9.16b //GHASH final block + + bif v9.16b, 
v26.16b, v0.16b //insert existing bytes in top end of result before storing + st1 { v9.16b}, [x2] //store all 16B + + eor v8.16b, v8.16b, v16.16b //feed in partial tag + + ins v16.d[0], v8.d[1] //GHASH final block - mid + + eor v16.8b, v16.8b, v8.8b //GHASH final block - mid + ldr q20, [x3, #32] //load h1l | h1h + ext v20.16b, v20.16b, v20.16b, #8 + + pmull v16.1q, v16.1d, v21.1d //GHASH final block - mid + + pmull2 v28.1q, v8.2d, v20.2d //GHASH final block - high + eor v18.16b, v18.16b, v16.16b //GHASH final block - mid + ldr d16, [x10] //MODULO - load modulo constant + + pmull v26.1q, v8.1d, v20.1d //GHASH final block - low + + eor v17.16b, v17.16b, v28.16b //GHASH final block - high + + eor v19.16b, v19.16b, v26.16b //GHASH final block - low + + ext v21.16b, v17.16b, v17.16b, #8 //MODULO - other top alignment + pmull v29.1q, v17.1d, v16.1d //MODULO - top 64b align with mid + +.inst 0xce114e52 //eor3 v18.16b, v18.16b, v17.16b, v19.16b //MODULO - karatsuba tidy up + +.inst 0xce1d5652 //eor3 v18.16b, v18.16b, v29.16b, v21.16b //MODULO - fold into mid + + pmull v17.1q, v18.1d, v16.1d //MODULO - mid 64b align with low + ext v21.16b, v18.16b, v18.16b, #8 //MODULO - other mid alignment + +.inst 0xce115673 //eor3 v19.16b, v19.16b, v17.16b, v21.16b //MODULO - fold into low + ext v19.16b, v19.16b, v19.16b, #8 + rev64 v19.16b, v19.16b + st1 { v19.16b }, [x3] + mov x0, x9 + + ldp d10, d11, [sp, #16] + ldp d12, d13, [sp, #32] + ldp d14, d15, [sp, #48] + ldp d8, d9, [sp], #80 + ret + +.L128_enc_ret: + mov w0, #0x0 + ret +.size unroll8_eor3_aes_gcm_enc_128_kernel,.-unroll8_eor3_aes_gcm_enc_128_kernel +.globl unroll8_eor3_aes_gcm_dec_128_kernel +.type unroll8_eor3_aes_gcm_dec_128_kernel,%function +.align 4 +unroll8_eor3_aes_gcm_dec_128_kernel: + AARCH64_VALID_CALL_TARGET + cbz x1, .L128_dec_ret + stp d8, d9, [sp, #-80]! 
+ lsr x9, x1, #3 + mov x16, x4 + mov x8, x5 + stp d10, d11, [sp, #16] + stp d12, d13, [sp, #32] + stp d14, d15, [sp, #48] + mov x5, #0xc200000000000000 + stp x5, xzr, [sp, #64] + add x10, sp, #64 + + mov x5, x9 + ld1 { v0.16b}, [x16] //CTR block 0 + + ldp q26, q27, [x8, #0] //load rk0, rk1 + sub x5, x5, #1 //byte_len - 1 + + mov x15, #0x100000000 //set up counter increment + movi v31.16b, #0x0 + mov v31.d[1], x15 + ld1 { v19.16b}, [x3] + ext v19.16b, v19.16b, v19.16b, #8 + rev64 v19.16b, v19.16b + + rev32 v30.16b, v0.16b //set up reversed counter + + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 0 - round 0 + + add v30.4s, v30.4s, v31.4s //CTR block 0 + + rev32 v1.16b, v30.16b //CTR block 1 + add v30.4s, v30.4s, v31.4s //CTR block 1 + + and x5, x5, #0xffffffffffffff80 //number of bytes to be processed in main loop (at least 1 byte must be handled by tail) + + rev32 v2.16b, v30.16b //CTR block 2 + add v30.4s, v30.4s, v31.4s //CTR block 2 + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 1 - round 0 + + rev32 v3.16b, v30.16b //CTR block 3 + add v30.4s, v30.4s, v31.4s //CTR block 3 + + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 0 - round 1 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 1 - round 1 + + rev32 v4.16b, v30.16b //CTR block 4 + add v30.4s, v30.4s, v31.4s //CTR block 4 + + rev32 v5.16b, v30.16b //CTR block 5 + add v30.4s, v30.4s, v31.4s //CTR block 5 + + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 2 - round 0 + + rev32 v6.16b, v30.16b //CTR block 6 + add v30.4s, v30.4s, v31.4s //CTR block 6 + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 5 - round 0 + + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 3 - round 0 + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 4 - round 0 + + rev32 v7.16b, v30.16b //CTR block 7 + + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 6 - round 0 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 2 - round 1 + + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 7 - round 0 + + ldp q28, q26, [x8, #32] //load rk2, rk3 + + aese v6.16b, v27.16b + aesmc v6.16b, v6.16b //AES block 6 - round 1 + aese v5.16b, v27.16b + aesmc v5.16b, v5.16b //AES block 5 - round 1 + + aese v4.16b, v27.16b + aesmc v4.16b, v4.16b //AES block 4 - round 1 + aese v7.16b, v27.16b + aesmc v7.16b, v7.16b //AES block 7 - round 1 + + aese v7.16b, v28.16b + aesmc v7.16b, v7.16b //AES block 7 - round 2 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 0 - round 2 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 3 - round 1 + + aese v6.16b, v28.16b + aesmc v6.16b, v6.16b //AES block 6 - round 2 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 2 - round 2 + aese v5.16b, v28.16b + aesmc v5.16b, v5.16b //AES block 5 - round 2 + + aese v4.16b, v28.16b + aesmc v4.16b, v4.16b //AES block 4 - round 2 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 3 - round 2 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 1 - round 2 + + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 6 - round 3 + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 2 - round 3 + + ldp q27, q28, [x8, #64] //load rk4, rk5 + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 5 - round 3 + + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 0 - round 3 + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 7 - round 3 + + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 3 - round 3 + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 1 - round 3 + + 
aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 0 - round 4 + aese v7.16b, v27.16b + aesmc v7.16b, v7.16b //AES block 7 - round 4 + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 4 - round 3 + + aese v6.16b, v27.16b + aesmc v6.16b, v6.16b //AES block 6 - round 4 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 1 - round 4 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 3 - round 4 + + aese v5.16b, v27.16b + aesmc v5.16b, v5.16b //AES block 5 - round 4 + aese v4.16b, v27.16b + aesmc v4.16b, v4.16b //AES block 4 - round 4 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 2 - round 4 + + ldp q26, q27, [x8, #96] //load rk6, rk7 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 2 - round 5 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 3 - round 5 + + aese v6.16b, v28.16b + aesmc v6.16b, v6.16b //AES block 6 - round 5 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 1 - round 5 + + aese v7.16b, v28.16b + aesmc v7.16b, v7.16b //AES block 7 - round 5 + aese v5.16b, v28.16b + aesmc v5.16b, v5.16b //AES block 5 - round 5 + + aese v4.16b, v28.16b + aesmc v4.16b, v4.16b //AES block 4 - round 5 + + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 3 - round 6 + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 2 - round 6 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 0 - round 5 + + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 5 - round 6 + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 4 - round 6 + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 1 - round 6 + + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 0 - round 6 + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 7 - round 6 + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 6 - round 6 + + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 3 - round 7 + aese v4.16b, v27.16b + aesmc v4.16b, v4.16b //AES block 4 - round 7 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 1 - round 7 + + aese v7.16b, v27.16b + aesmc v7.16b, v7.16b //AES block 7 - round 7 + aese v5.16b, v27.16b + aesmc v5.16b, v5.16b //AES block 5 - round 7 + ldp q28, q26, [x8, #128] //load rk8, rk9 + + aese v6.16b, v27.16b + aesmc v6.16b, v6.16b //AES block 6 - round 7 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 2 - round 7 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 0 - round 7 + + add x5, x5, x0 + add v30.4s, v30.4s, v31.4s //CTR block 7 + + aese v6.16b, v28.16b + aesmc v6.16b, v6.16b //AES block 6 - round 8 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 0 - round 8 + + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 1 - round 8 + aese v7.16b, v28.16b + aesmc v7.16b, v7.16b //AES block 7 - round 8 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 3 - round 8 + + aese v5.16b, v28.16b + aesmc v5.16b, v5.16b //AES block 5 - round 8 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 2 - round 8 + aese v4.16b, v28.16b + aesmc v4.16b, v4.16b //AES block 4 - round 8 + + aese v0.16b, v26.16b //AES block 0 - round 9 + aese v1.16b, v26.16b //AES block 1 - round 9 + aese v6.16b, v26.16b //AES block 6 - round 9 + + ldr q27, [x8, #160] //load rk10 + aese v4.16b, v26.16b //AES block 4 - round 9 + aese v3.16b, v26.16b //AES block 3 - round 9 + + aese v2.16b, v26.16b //AES block 2 - round 9 + aese v5.16b, v26.16b //AES block 5 - round 9 + aese v7.16b, v26.16b //AES block 7 - round 9 + + add x4, x0, x1, lsr #3 //end_input_ptr + cmp x0, x5 //check if we have <= 8 blocks + b.ge 
.L128_dec_tail //handle tail + + ldp q8, q9, [x0], #32 //AES block 0, 1 - load ciphertext + +.inst 0xce006d00 //eor3 v0.16b, v8.16b, v0.16b, v27.16b //AES block 0 - result +.inst 0xce016d21 //eor3 v1.16b, v9.16b, v1.16b, v27.16b //AES block 1 - result + stp q0, q1, [x2], #32 //AES block 0, 1 - store result + + rev32 v0.16b, v30.16b //CTR block 8 + add v30.4s, v30.4s, v31.4s //CTR block 8 + ldp q10, q11, [x0], #32 //AES block 2, 3 - load ciphertext + + ldp q12, q13, [x0], #32 //AES block 4, 5 - load ciphertext + + rev32 v1.16b, v30.16b //CTR block 9 + add v30.4s, v30.4s, v31.4s //CTR block 9 + ldp q14, q15, [x0], #32 //AES block 6, 7 - load ciphertext + +.inst 0xce036d63 //eor3 v3.16b, v11.16b, v3.16b, v27.16b //AES block 3 - result +.inst 0xce026d42 //eor3 v2.16b, v10.16b, v2.16b, v27.16b //AES block 2 - result + stp q2, q3, [x2], #32 //AES block 2, 3 - store result + + rev32 v2.16b, v30.16b //CTR block 10 + add v30.4s, v30.4s, v31.4s //CTR block 10 + +.inst 0xce066dc6 //eor3 v6.16b, v14.16b, v6.16b, v27.16b //AES block 6 - result + + rev32 v3.16b, v30.16b //CTR block 11 + add v30.4s, v30.4s, v31.4s //CTR block 11 + +.inst 0xce046d84 //eor3 v4.16b, v12.16b, v4.16b, v27.16b //AES block 4 - result +.inst 0xce056da5 //eor3 v5.16b, v13.16b, v5.16b, v27.16b //AES block 5 - result + stp q4, q5, [x2], #32 //AES block 4, 5 - store result + +.inst 0xce076de7 //eor3 v7.16b, v15.16b, v7.16b, v27.16b //AES block 7 - result + stp q6, q7, [x2], #32 //AES block 6, 7 - store result + rev32 v4.16b, v30.16b //CTR block 12 + + cmp x0, x5 //check if we have <= 8 blocks + add v30.4s, v30.4s, v31.4s //CTR block 12 + b.ge .L128_dec_prepretail //do prepretail + +.L128_dec_main_loop: //main loop start + ldr q23, [x3, #176] //load h7l | h7h + ext v23.16b, v23.16b, v23.16b, #8 + ldr q25, [x3, #208] //load h8l | h8h + ext v25.16b, v25.16b, v25.16b, #8 + + rev64 v9.16b, v9.16b //GHASH block 8k+1 + rev64 v8.16b, v8.16b //GHASH block 8k + ext v19.16b, v19.16b, v19.16b, #8 //PRE 0 + + rev64 v14.16b, v14.16b //GHASH block 8k+6 + ldr q20, [x3, #128] //load h5l | h5h + ext v20.16b, v20.16b, v20.16b, #8 + ldr q22, [x3, #160] //load h6l | h6h + ext v22.16b, v22.16b, v22.16b, #8 + + eor v8.16b, v8.16b, v19.16b //PRE 1 + rev32 v5.16b, v30.16b //CTR block 8k+13 + add v30.4s, v30.4s, v31.4s //CTR block 8k+13 + + rev64 v10.16b, v10.16b //GHASH block 8k+2 + rev64 v12.16b, v12.16b //GHASH block 8k+4 + ldp q26, q27, [x8, #0] //load rk0, rk1 + + rev32 v6.16b, v30.16b //CTR block 8k+14 + add v30.4s, v30.4s, v31.4s //CTR block 8k+14 + ldr q21, [x3, #144] //load h6k | h5k + ldr q24, [x3, #192] //load h8k | h7k + + pmull2 v16.1q, v9.2d, v23.2d //GHASH block 8k+1 - high + pmull2 v17.1q, v8.2d, v25.2d //GHASH block 8k - high + rev64 v11.16b, v11.16b //GHASH block 8k+3 + + rev32 v7.16b, v30.16b //CTR block 8k+15 + trn1 v18.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid + rev64 v13.16b, v13.16b //GHASH block 8k+5 + + pmull v23.1q, v9.1d, v23.1d //GHASH block 8k+1 - low + pmull v19.1q, v8.1d, v25.1d //GHASH block 8k - low + trn2 v8.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid + + pmull2 v29.1q, v10.2d, v22.2d //GHASH block 8k+2 - high + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 0 + pmull2 v9.1q, v11.2d, v20.2d //GHASH block 8k+3 - high + + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 0 + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 0 + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 0 + + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 
8k+11 - round 0 + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 0 + eor v17.16b, v17.16b, v16.16b //GHASH block 8k+1 - high + + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 0 + eor v8.16b, v8.16b, v18.16b //GHASH block 8k, 8k+1 - mid + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 0 + + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 1 + eor v19.16b, v19.16b, v23.16b //GHASH block 8k+1 - low +.inst 0xce1d2631 //eor3 v17.16b, v17.16b, v29.16b, v9.16b //GHASH block 8k+2, 8k+3 - high + + ldp q28, q26, [x8, #32] //load rk2, rk3 + trn1 v29.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid + aese v7.16b, v27.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 1 + + pmull v22.1q, v10.1d, v22.1d //GHASH block 8k+2 - low + trn2 v10.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid + pmull2 v18.1q, v8.2d, v24.2d //GHASH block 8k - mid + + ldr q23, [x3, #80] //load h3l | h3h + ext v23.16b, v23.16b, v23.16b, #8 + ldr q25, [x3, #112] //load h4l | h4h + ext v25.16b, v25.16b, v25.16b, #8 + pmull v24.1q, v8.1d, v24.1d //GHASH block 8k+1 - mid + aese v6.16b, v27.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 1 + + aese v4.16b, v27.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 1 + aese v5.16b, v27.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 1 + pmull v20.1q, v11.1d, v20.1d //GHASH block 8k+3 - low + + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 1 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 1 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 1 + + aese v7.16b, v28.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 2 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 2 +.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+2, 8k+3 - low + + aese v4.16b, v28.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 2 + eor v10.16b, v10.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid + ldr q20, [x3, #32] //load h1l | h1h + ext v20.16b, v20.16b, v20.16b, #8 + ldr q22, [x3, #64] //load h2l | h2h + ext v22.16b, v22.16b, v22.16b, #8 + + eor v18.16b, v18.16b, v24.16b //GHASH block 8k+1 - mid + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 2 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 2 + + trn1 v16.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid + aese v5.16b, v28.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 2 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 2 + + aese v6.16b, v28.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 2 + pmull2 v29.1q, v10.2d, v21.2d //GHASH block 8k+2 - mid + pmull v21.1q, v10.1d, v21.1d //GHASH block 8k+3 - mid + + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 3 + rev64 v15.16b, v15.16b //GHASH block 8k+7 + pmull2 v8.1q, v12.2d, v25.2d //GHASH block 8k+4 - high + + ldp q27, q28, [x8, #64] //load rk4, rk5 + pmull v25.1q, v12.1d, v25.1d //GHASH block 8k+4 - low +.inst 0xce157652 //eor3 v18.16b, v18.16b, v21.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid + + ldr q21, [x3, #48] //load h2k | h1k + ldr q24, [x3, #96] //load h4k | h3k + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 3 + trn2 v12.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid + + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 3 + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 3 + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES 
block 8k+9 - round 3 + + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 3 + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 3 + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 3 + + pmull2 v10.1q, v13.2d, v23.2d //GHASH block 8k+5 - high + pmull v23.1q, v13.1d, v23.1d //GHASH block 8k+5 - low + pmull2 v11.1q, v14.2d, v22.2d //GHASH block 8k+6 - high + + pmull v22.1q, v14.1d, v22.1d //GHASH block 8k+6 - low + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 4 + aese v7.16b, v27.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 4 + + eor v12.16b, v12.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid + trn1 v13.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 4 + + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 4 + aese v5.16b, v27.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 4 + aese v6.16b, v27.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 4 + + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 4 + aese v4.16b, v27.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 4 + trn2 v14.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid + + ldp q26, q27, [x8, #96] //load rk6, rk7 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 5 + pmull2 v16.1q, v12.2d, v24.2d //GHASH block 8k+4 - mid + + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 5 + eor v14.16b, v14.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 5 + + pmull v24.1q, v12.1d, v24.1d //GHASH block 8k+5 - mid + aese v6.16b, v28.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 5 + aese v7.16b, v28.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 5 + + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 5 + aese v5.16b, v28.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 5 + aese v4.16b, v28.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 5 + + pmull2 v12.1q, v15.2d, v20.2d //GHASH block 8k+7 - high +.inst 0xce184252 //eor3 v18.16b, v18.16b, v24.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid +.inst 0xce195e73 //eor3 v19.16b, v19.16b, v25.16b, v23.16b //GHASH block 8k+4, 8k+5 - low + + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 6 +.inst 0xce082a31 //eor3 v17.16b, v17.16b, v8.16b, v10.16b //GHASH block 8k+4, 8k+5 - high + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 6 + + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 6 + pmull2 v13.1q, v14.2d, v21.2d //GHASH block 8k+6 - mid + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 6 + + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 6 + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 6 + pmull v20.1q, v15.1d, v20.1d //GHASH block 8k+7 - low + + pmull v21.1q, v14.1d, v21.1d //GHASH block 8k+7 - mid + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 6 + add v30.4s, v30.4s, v31.4s //CTR block 8k+15 + +.inst 0xce0b3231 //eor3 v17.16b, v17.16b, v11.16b, v12.16b //GHASH block 8k+6, 8k+7 - high + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 6 + ldp q28, q26, [x8, #128] //load rk8, rk9 + + ldr d16, [x10] //MODULO - load modulo constant +.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+6, 8k+7 - low + aese v5.16b, v27.16b + aesmc v5.16b, v5.16b //AES 
block 8k+13 - round 7 + + rev32 v20.16b, v30.16b //CTR block 8k+16 +.inst 0xce153652 //eor3 v18.16b, v18.16b, v21.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid + add v30.4s, v30.4s, v31.4s //CTR block 8k+16 + + aese v6.16b, v27.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 7 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 7 + aese v7.16b, v27.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 7 + + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 7 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 7 + rev32 v22.16b, v30.16b //CTR block 8k+17 + + aese v4.16b, v27.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 7 + ext v21.16b, v17.16b, v17.16b, #8 //MODULO - other top alignment + pmull v29.1q, v17.1d, v16.1d //MODULO - top 64b align with mid + +.inst 0xce114e52 //eor3 v18.16b, v18.16b, v17.16b, v19.16b //MODULO - karatsuba tidy up + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 7 + add v30.4s, v30.4s, v31.4s //CTR block 8k+17 + + aese v5.16b, v28.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 8 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 8 + ldp q8, q9, [x0], #32 //AES block 8k+8, 8k+9 - load ciphertext + + ldp q10, q11, [x0], #32 //AES block 8k+10, 8k+11 - load ciphertext + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 8 + rev32 v23.16b, v30.16b //CTR block 8k+18 + + ldp q12, q13, [x0], #32 //AES block 8k+12, 8k+13 - load ciphertext + aese v4.16b, v28.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 8 +.inst 0xce1d5652 //eor3 v18.16b, v18.16b, v29.16b, v21.16b //MODULO - fold into mid + + ldp q14, q15, [x0], #32 //AES block 8k+14, 8k+15 - load ciphertext + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 8 + add v30.4s, v30.4s, v31.4s //CTR block 8k+18 + + aese v7.16b, v28.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 8 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 8 + aese v6.16b, v28.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 8 + + aese v0.16b, v26.16b //AES block 8k+8 - round 9 + aese v1.16b, v26.16b //AES block 8k+9 - round 9 + ldr q27, [x8, #160] //load rk10 + + aese v6.16b, v26.16b //AES block 8k+14 - round 9 + pmull v17.1q, v18.1d, v16.1d //MODULO - mid 64b align with low + aese v2.16b, v26.16b //AES block 8k+10 - round 9 + + aese v7.16b, v26.16b //AES block 8k+15 - round 9 + aese v4.16b, v26.16b //AES block 8k+12 - round 9 + ext v21.16b, v18.16b, v18.16b, #8 //MODULO - other mid alignment + + rev32 v25.16b, v30.16b //CTR block 8k+19 + add v30.4s, v30.4s, v31.4s //CTR block 8k+19 + + aese v3.16b, v26.16b //AES block 8k+11 - round 9 + aese v5.16b, v26.16b //AES block 8k+13 - round 9 +.inst 0xce016d21 //eor3 v1.16b, v9.16b, v1.16b, v27.16b //AES block 8k+9 - result + +.inst 0xce006d00 //eor3 v0.16b, v8.16b, v0.16b, v27.16b //AES block 8k+8 - result +.inst 0xce076de7 //eor3 v7.16b, v15.16b, v7.16b, v27.16b //AES block 8k+15 - result +.inst 0xce066dc6 //eor3 v6.16b, v14.16b, v6.16b, v27.16b //AES block 8k+14 - result + +.inst 0xce026d42 //eor3 v2.16b, v10.16b, v2.16b, v27.16b //AES block 8k+10 - result + stp q0, q1, [x2], #32 //AES block 8k+8, 8k+9 - store result + mov v1.16b, v22.16b //CTR block 8k+17 + +.inst 0xce046d84 //eor3 v4.16b, v12.16b, v4.16b, v27.16b //AES block 8k+12 - result +.inst 0xce115673 //eor3 v19.16b, v19.16b, v17.16b, v21.16b //MODULO - fold into low + mov v0.16b, v20.16b //CTR block 8k+16 + +.inst 0xce036d63 //eor3 v3.16b, v11.16b, v3.16b, 
v27.16b //AES block 8k+11 - result + cmp x0, x5 //.LOOP CONTROL + stp q2, q3, [x2], #32 //AES block 8k+10, 8k+11 - store result + +.inst 0xce056da5 //eor3 v5.16b, v13.16b, v5.16b, v27.16b //AES block 8k+13 - result + mov v2.16b, v23.16b //CTR block 8k+18 + + stp q4, q5, [x2], #32 //AES block 8k+12, 8k+13 - store result + rev32 v4.16b, v30.16b //CTR block 8k+20 + add v30.4s, v30.4s, v31.4s //CTR block 8k+20 + + stp q6, q7, [x2], #32 //AES block 8k+14, 8k+15 - store result + mov v3.16b, v25.16b //CTR block 8k+19 + b.lt .L128_dec_main_loop + +.L128_dec_prepretail: //PREPRETAIL + rev64 v11.16b, v11.16b //GHASH block 8k+3 + ext v19.16b, v19.16b, v19.16b, #8 //PRE 0 + rev64 v8.16b, v8.16b //GHASH block 8k + + rev64 v10.16b, v10.16b //GHASH block 8k+2 + rev32 v5.16b, v30.16b //CTR block 8k+13 + ldp q26, q27, [x8, #0] //load rk0, rk1 + + ldr q23, [x3, #176] //load h7l | h7h + ext v23.16b, v23.16b, v23.16b, #8 + ldr q25, [x3, #208] //load h8l | h8h + ext v25.16b, v25.16b, v25.16b, #8 + eor v8.16b, v8.16b, v19.16b //PRE 1 + rev64 v9.16b, v9.16b //GHASH block 8k+1 + + add v30.4s, v30.4s, v31.4s //CTR block 8k+13 + ldr q20, [x3, #128] //load h5l | h5h + ext v20.16b, v20.16b, v20.16b, #8 + ldr q22, [x3, #160] //load h6l | h6h + ext v22.16b, v22.16b, v22.16b, #8 + rev64 v13.16b, v13.16b //GHASH block 8k+5 + + rev64 v12.16b, v12.16b //GHASH block 8k+4 + + rev64 v14.16b, v14.16b //GHASH block 8k+6 + + ldr q21, [x3, #144] //load h6k | h5k + ldr q24, [x3, #192] //load h8k | h7k + rev32 v6.16b, v30.16b //CTR block 8k+14 + add v30.4s, v30.4s, v31.4s //CTR block 8k+14 + + pmull2 v16.1q, v9.2d, v23.2d //GHASH block 8k+1 - high + pmull v19.1q, v8.1d, v25.1d //GHASH block 8k - low + pmull2 v17.1q, v8.2d, v25.2d //GHASH block 8k - high + + trn1 v18.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid + trn2 v8.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid + pmull2 v29.1q, v10.2d, v22.2d //GHASH block 8k+2 - high + + pmull v23.1q, v9.1d, v23.1d //GHASH block 8k+1 - low + pmull2 v9.1q, v11.2d, v20.2d //GHASH block 8k+3 - high + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 0 + + eor v17.16b, v17.16b, v16.16b //GHASH block 8k+1 - high + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 0 + eor v8.16b, v8.16b, v18.16b //GHASH block 8k, 8k+1 - mid + + pmull v22.1q, v10.1d, v22.1d //GHASH block 8k+2 - low + rev32 v7.16b, v30.16b //CTR block 8k+15 + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 0 + +.inst 0xce1d2631 //eor3 v17.16b, v17.16b, v29.16b, v9.16b //GHASH block 8k+2, 8k+3 - high + trn1 v29.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid + trn2 v10.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid + + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 0 + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 0 + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 0 + + pmull2 v18.1q, v8.2d, v24.2d //GHASH block 8k - mid + pmull v24.1q, v8.1d, v24.1d //GHASH block 8k+1 - mid + pmull v20.1q, v11.1d, v20.1d //GHASH block 8k+3 - low + + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 1 + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 0 + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 0 + + eor v19.16b, v19.16b, v23.16b //GHASH block 8k+1 - low + eor v10.16b, v10.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid + eor v18.16b, v18.16b, v24.16b //GHASH block 8k+1 - mid + + aese v6.16b, v27.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 1 + 
aese v4.16b, v27.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 1 + aese v5.16b, v27.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 1 + + ldp q28, q26, [x8, #32] //load rk2, rk3 +.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+2, 8k+3 - low + pmull2 v29.1q, v10.2d, v21.2d //GHASH block 8k+2 - mid + + ldr q23, [x3, #80] //load h3l | h3h + ext v23.16b, v23.16b, v23.16b, #8 + ldr q25, [x3, #112] //load h4l | h4h + ext v25.16b, v25.16b, v25.16b, #8 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 1 + pmull v21.1q, v10.1d, v21.1d //GHASH block 8k+3 - mid + + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 1 + aese v7.16b, v27.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 1 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 1 + + ldr q20, [x3, #32] //load h1l | h1h + ext v20.16b, v20.16b, v20.16b, #8 + ldr q22, [x3, #64] //load h2l | h2h + ext v22.16b, v22.16b, v22.16b, #8 +.inst 0xce157652 //eor3 v18.16b, v18.16b, v21.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid + + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 2 + aese v6.16b, v28.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 2 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 2 + + aese v4.16b, v28.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 2 + trn1 v16.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid + aese v7.16b, v28.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 2 + + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 2 + aese v5.16b, v28.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 2 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 2 + + pmull2 v8.1q, v12.2d, v25.2d //GHASH block 8k+4 - high + pmull v25.1q, v12.1d, v25.1d //GHASH block 8k+4 - low + trn2 v12.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid + + ldp q27, q28, [x8, #64] //load rk4, rk5 + rev64 v15.16b, v15.16b //GHASH block 8k+7 + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 3 + + ldr q21, [x3, #48] //load h2k | h1k + ldr q24, [x3, #96] //load h4k | h3k + pmull2 v10.1q, v13.2d, v23.2d //GHASH block 8k+5 - high + pmull v23.1q, v13.1d, v23.1d //GHASH block 8k+5 - low + + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 3 + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 3 + trn1 v13.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid + + pmull2 v11.1q, v14.2d, v22.2d //GHASH block 8k+6 - high + pmull v22.1q, v14.1d, v22.1d //GHASH block 8k+6 - low + trn2 v14.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid + + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 3 + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 3 + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 3 + + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 3 + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 3 + eor v12.16b, v12.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid + +.inst 0xce082a31 //eor3 v17.16b, v17.16b, v8.16b, v10.16b //GHASH block 8k+4, 8k+5 - high + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 4 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 4 + + eor v14.16b, v14.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid + aese v5.16b, v27.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 4 + pmull2 v16.1q, v12.2d, v24.2d //GHASH block 8k+4 - mid + + aese v1.16b, v27.16b + 
aesmc v1.16b, v1.16b //AES block 8k+9 - round 4 + aese v6.16b, v27.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 4 + aese v4.16b, v27.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 4 + + aese v7.16b, v27.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 4 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 4 + pmull v24.1q, v12.1d, v24.1d //GHASH block 8k+5 - mid + + pmull2 v12.1q, v15.2d, v20.2d //GHASH block 8k+7 - high + pmull2 v13.1q, v14.2d, v21.2d //GHASH block 8k+6 - mid + pmull v21.1q, v14.1d, v21.1d //GHASH block 8k+7 - mid + + ldp q26, q27, [x8, #96] //load rk6, rk7 +.inst 0xce184252 //eor3 v18.16b, v18.16b, v24.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid + aese v6.16b, v28.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 5 + + ldr d16, [x10] //MODULO - load modulo constant + pmull v20.1q, v15.1d, v20.1d //GHASH block 8k+7 - low +.inst 0xce195e73 //eor3 v19.16b, v19.16b, v25.16b, v23.16b //GHASH block 8k+4, 8k+5 - low + + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 5 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 5 + aese v4.16b, v28.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 5 + + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 5 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 5 + aese v5.16b, v28.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 5 + + aese v7.16b, v28.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 5 +.inst 0xce153652 //eor3 v18.16b, v18.16b, v21.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid +.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+6, 8k+7 - low + + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 6 + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 6 + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 6 + +.inst 0xce0b3231 //eor3 v17.16b, v17.16b, v11.16b, v12.16b //GHASH block 8k+6, 8k+7 - high + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 6 + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 6 + + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 6 + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 6 + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 6 + + aese v4.16b, v27.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 7 +.inst 0xce114e52 //eor3 v18.16b, v18.16b, v17.16b, v19.16b //MODULO - karatsuba tidy up + ldp q28, q26, [x8, #128] //load rk8, rk9 + + pmull v29.1q, v17.1d, v16.1d //MODULO - top 64b align with mid + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 7 + ext v21.16b, v17.16b, v17.16b, #8 //MODULO - other top alignment + + aese v5.16b, v27.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 7 + aese v6.16b, v27.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 7 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 7 + + aese v7.16b, v27.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 7 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 7 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 7 + +.inst 0xce1d5652 //eor3 v18.16b, v18.16b, v29.16b, v21.16b //MODULO - fold into mid + ldr q27, [x8, #160] //load rk10 + + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 8 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 8 + + pmull v17.1q, v18.1d, v16.1d //MODULO - mid 
64b align with low + aese v6.16b, v28.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 8 + ext v21.16b, v18.16b, v18.16b, #8 //MODULO - other mid alignment + + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 8 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 8 + aese v7.16b, v28.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 8 + + aese v6.16b, v26.16b //AES block 8k+14 - round 9 + aese v5.16b, v28.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 8 + aese v4.16b, v28.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 8 + +.inst 0xce115673 //eor3 v19.16b, v19.16b, v17.16b, v21.16b //MODULO - fold into low + add v30.4s, v30.4s, v31.4s //CTR block 8k+15 + aese v2.16b, v26.16b //AES block 8k+10 - round 9 + + aese v3.16b, v26.16b //AES block 8k+11 - round 9 + aese v5.16b, v26.16b //AES block 8k+13 - round 9 + aese v0.16b, v26.16b //AES block 8k+8 - round 9 + + aese v4.16b, v26.16b //AES block 8k+12 - round 9 + aese v1.16b, v26.16b //AES block 8k+9 - round 9 + aese v7.16b, v26.16b //AES block 8k+15 - round 9 + +.L128_dec_tail: //TAIL + + mov v29.16b, v27.16b + sub x5, x4, x0 //main_end_input_ptr is number of bytes left to process + + cmp x5, #112 + + ldp q24, q25, [x3, #192] //load h8k | h7k + ext v25.16b, v25.16b, v25.16b, #8 + ldr q9, [x0], #16 //AES block 8k+8 - load ciphertext + + ldp q20, q21, [x3, #128] //load h5l | h5h + ext v20.16b, v20.16b, v20.16b, #8 + ext v16.16b, v19.16b, v19.16b, #8 //prepare final partial tag + + ldp q22, q23, [x3, #160] //load h6l | h6h + ext v22.16b, v22.16b, v22.16b, #8 + ext v23.16b, v23.16b, v23.16b, #8 + +.inst 0xce00752c //eor3 v12.16b, v9.16b, v0.16b, v29.16b //AES block 8k+8 - result + b.gt .L128_dec_blocks_more_than_7 + + cmp x5, #96 + mov v7.16b, v6.16b + movi v19.8b, #0 + + movi v17.8b, #0 + mov v6.16b, v5.16b + mov v5.16b, v4.16b + + mov v4.16b, v3.16b + mov v3.16b, v2.16b + mov v2.16b, v1.16b + + movi v18.8b, #0 + sub v30.4s, v30.4s, v31.4s + b.gt .L128_dec_blocks_more_than_6 + + cmp x5, #80 + sub v30.4s, v30.4s, v31.4s + + mov v7.16b, v6.16b + mov v6.16b, v5.16b + mov v5.16b, v4.16b + + mov v4.16b, v3.16b + mov v3.16b, v1.16b + b.gt .L128_dec_blocks_more_than_5 + + cmp x5, #64 + + mov v7.16b, v6.16b + mov v6.16b, v5.16b + mov v5.16b, v4.16b + + mov v4.16b, v1.16b + sub v30.4s, v30.4s, v31.4s + b.gt .L128_dec_blocks_more_than_4 + + sub v30.4s, v30.4s, v31.4s + mov v7.16b, v6.16b + mov v6.16b, v5.16b + + mov v5.16b, v1.16b + cmp x5, #48 + b.gt .L128_dec_blocks_more_than_3 + + sub v30.4s, v30.4s, v31.4s + mov v7.16b, v6.16b + cmp x5, #32 + + ldr q24, [x3, #96] //load h4k | h3k + mov v6.16b, v1.16b + b.gt .L128_dec_blocks_more_than_2 + + cmp x5, #16 + + mov v7.16b, v1.16b + sub v30.4s, v30.4s, v31.4s + b.gt .L128_dec_blocks_more_than_1 + + sub v30.4s, v30.4s, v31.4s + ldr q21, [x3, #48] //load h2k | h1k + b .L128_dec_blocks_less_than_1 +.L128_dec_blocks_more_than_7: //blocks left > 7 + rev64 v8.16b, v9.16b //GHASH final-7 block + + eor v8.16b, v8.16b, v16.16b //feed in partial tag + + ins v18.d[0], v24.d[1] //GHASH final-7 block - mid + + pmull v19.1q, v8.1d, v25.1d //GHASH final-7 block - low + ins v27.d[0], v8.d[1] //GHASH final-7 block - mid + + movi v16.8b, #0 //suppress further partial tag feed in + ldr q9, [x0], #16 //AES final-6 block - load ciphertext + + eor v27.8b, v27.8b, v8.8b //GHASH final-7 block - mid + + pmull2 v17.1q, v8.2d, v25.2d //GHASH final-7 block - high + st1 { v12.16b}, [x2], #16 //AES final-7 block - store result +.inst 0xce01752c //eor3 v12.16b, v9.16b, v1.16b, v29.16b 
//AES final-6 block - result + + pmull v18.1q, v27.1d, v18.1d //GHASH final-7 block - mid +.L128_dec_blocks_more_than_6: //blocks left > 6 + + rev64 v8.16b, v9.16b //GHASH final-6 block + + eor v8.16b, v8.16b, v16.16b //feed in partial tag + + ins v27.d[0], v8.d[1] //GHASH final-6 block - mid + + eor v27.8b, v27.8b, v8.8b //GHASH final-6 block - mid + + pmull v26.1q, v8.1d, v23.1d //GHASH final-6 block - low + ldr q9, [x0], #16 //AES final-5 block - load ciphertext + movi v16.8b, #0 //suppress further partial tag feed in + + pmull v27.1q, v27.1d, v24.1d //GHASH final-6 block - mid + st1 { v12.16b}, [x2], #16 //AES final-6 block - store result + pmull2 v28.1q, v8.2d, v23.2d //GHASH final-6 block - high + + eor v19.16b, v19.16b, v26.16b //GHASH final-6 block - low + eor v17.16b, v17.16b, v28.16b //GHASH final-6 block - high + + eor v18.16b, v18.16b, v27.16b //GHASH final-6 block - mid +.inst 0xce02752c //eor3 v12.16b, v9.16b, v2.16b, v29.16b //AES final-5 block - result +.L128_dec_blocks_more_than_5: //blocks left > 5 + + rev64 v8.16b, v9.16b //GHASH final-5 block + + ldr q9, [x0], #16 //AES final-4 block - load ciphertext + st1 { v12.16b}, [x2], #16 //AES final-5 block - store result + + eor v8.16b, v8.16b, v16.16b //feed in partial tag + + ins v27.d[0], v8.d[1] //GHASH final-5 block - mid + +.inst 0xce03752c //eor3 v12.16b, v9.16b, v3.16b, v29.16b //AES final-4 block - result + + eor v27.8b, v27.8b, v8.8b //GHASH final-5 block - mid + + ins v27.d[1], v27.d[0] //GHASH final-5 block - mid + pmull v26.1q, v8.1d, v22.1d //GHASH final-5 block - low + movi v16.8b, #0 //suppress further partial tag feed in + + pmull2 v27.1q, v27.2d, v21.2d //GHASH final-5 block - mid + pmull2 v28.1q, v8.2d, v22.2d //GHASH final-5 block - high + eor v19.16b, v19.16b, v26.16b //GHASH final-5 block - low + + eor v18.16b, v18.16b, v27.16b //GHASH final-5 block - mid + eor v17.16b, v17.16b, v28.16b //GHASH final-5 block - high +.L128_dec_blocks_more_than_4: //blocks left > 4 + + rev64 v8.16b, v9.16b //GHASH final-4 block + + eor v8.16b, v8.16b, v16.16b //feed in partial tag + ldr q9, [x0], #16 //AES final-3 block - load ciphertext + + ins v27.d[0], v8.d[1] //GHASH final-4 block - mid + movi v16.8b, #0 //suppress further partial tag feed in + pmull2 v28.1q, v8.2d, v20.2d //GHASH final-4 block - high + + pmull v26.1q, v8.1d, v20.1d //GHASH final-4 block - low + + eor v17.16b, v17.16b, v28.16b //GHASH final-4 block - high + + st1 { v12.16b}, [x2], #16 //AES final-4 block - store result + eor v27.8b, v27.8b, v8.8b //GHASH final-4 block - mid + +.inst 0xce04752c //eor3 v12.16b, v9.16b, v4.16b, v29.16b //AES final-3 block - result + eor v19.16b, v19.16b, v26.16b //GHASH final-4 block - low + + pmull v27.1q, v27.1d, v21.1d //GHASH final-4 block - mid + + eor v18.16b, v18.16b, v27.16b //GHASH final-4 block - mid +.L128_dec_blocks_more_than_3: //blocks left > 3 + + st1 { v12.16b}, [x2], #16 //AES final-3 block - store result + rev64 v8.16b, v9.16b //GHASH final-3 block + + eor v8.16b, v8.16b, v16.16b //feed in partial tag + + ins v27.d[0], v8.d[1] //GHASH final-3 block - mid + + ldr q25, [x3, #112] //load h4l | h4h + ext v25.16b, v25.16b, v25.16b, #8 + ldr q24, [x3, #96] //load h4k | h3k + + eor v27.8b, v27.8b, v8.8b //GHASH final-3 block - mid + + ldr q9, [x0], #16 //AES final-2 block - load ciphertext + + ins v27.d[1], v27.d[0] //GHASH final-3 block - mid + pmull v26.1q, v8.1d, v25.1d //GHASH final-3 block - low + pmull2 v28.1q, v8.2d, v25.2d //GHASH final-3 block - high + + movi v16.8b, #0 //suppress further partial tag 
feed in +.inst 0xce05752c //eor3 v12.16b, v9.16b, v5.16b, v29.16b //AES final-2 block - result + eor v19.16b, v19.16b, v26.16b //GHASH final-3 block - low + + pmull2 v27.1q, v27.2d, v24.2d //GHASH final-3 block - mid + + eor v17.16b, v17.16b, v28.16b //GHASH final-3 block - high + eor v18.16b, v18.16b, v27.16b //GHASH final-3 block - mid +.L128_dec_blocks_more_than_2: //blocks left > 2 + + rev64 v8.16b, v9.16b //GHASH final-2 block + + st1 { v12.16b}, [x2], #16 //AES final-2 block - store result + + eor v8.16b, v8.16b, v16.16b //feed in partial tag + ldr q23, [x3, #80] //load h3l | h3h + ext v23.16b, v23.16b, v23.16b, #8 + movi v16.8b, #0 //suppress further partial tag feed in + + ins v27.d[0], v8.d[1] //GHASH final-2 block - mid + + eor v27.8b, v27.8b, v8.8b //GHASH final-2 block - mid + + pmull v26.1q, v8.1d, v23.1d //GHASH final-2 block - low + + pmull2 v28.1q, v8.2d, v23.2d //GHASH final-2 block - high + pmull v27.1q, v27.1d, v24.1d //GHASH final-2 block - mid + ldr q9, [x0], #16 //AES final-1 block - load ciphertext + + eor v18.16b, v18.16b, v27.16b //GHASH final-2 block - mid + + eor v19.16b, v19.16b, v26.16b //GHASH final-2 block - low + +.inst 0xce06752c //eor3 v12.16b, v9.16b, v6.16b, v29.16b //AES final-1 block - result + eor v17.16b, v17.16b, v28.16b //GHASH final-2 block - high +.L128_dec_blocks_more_than_1: //blocks left > 1 + + st1 { v12.16b}, [x2], #16 //AES final-1 block - store result + rev64 v8.16b, v9.16b //GHASH final-1 block + + ldr q22, [x3, #64] //load h2l | h2h + ext v22.16b, v22.16b, v22.16b, #8 + + eor v8.16b, v8.16b, v16.16b //feed in partial tag + + movi v16.8b, #0 //suppress further partial tag feed in + + ins v27.d[0], v8.d[1] //GHASH final-1 block - mid + + ldr q9, [x0], #16 //AES final block - load ciphertext + pmull2 v28.1q, v8.2d, v22.2d //GHASH final-1 block - high + + eor v27.8b, v27.8b, v8.8b //GHASH final-1 block - mid + eor v17.16b, v17.16b, v28.16b //GHASH final-1 block - high + ldr q21, [x3, #48] //load h2k | h1k + + ins v27.d[1], v27.d[0] //GHASH final-1 block - mid +.inst 0xce07752c //eor3 v12.16b, v9.16b, v7.16b, v29.16b //AES final block - result + + pmull v26.1q, v8.1d, v22.1d //GHASH final-1 block - low + + pmull2 v27.1q, v27.2d, v21.2d //GHASH final-1 block - mid + + eor v19.16b, v19.16b, v26.16b //GHASH final-1 block - low + + eor v18.16b, v18.16b, v27.16b //GHASH final-1 block - mid +.L128_dec_blocks_less_than_1: //blocks left <= 1 + + and x1, x1, #127 //bit_length %= 128 + + sub x1, x1, #128 //bit_length -= 128 + + neg x1, x1 //bit_length = 128 - #bits in input (in range [1,128]) + + mvn x6, xzr //temp0_x = 0xffffffffffffffff + and x1, x1, #127 //bit_length %= 128 + + lsr x6, x6, x1 //temp0_x is mask for top 64b of last block + cmp x1, #64 + mvn x7, xzr //temp1_x = 0xffffffffffffffff + + csel x13, x7, x6, lt + csel x14, x6, xzr, lt + + mov v0.d[1], x14 + mov v0.d[0], x13 //ctr0b is mask for last block + + ldr q20, [x3, #32] //load h1l | h1h + ext v20.16b, v20.16b, v20.16b, #8 + ld1 { v26.16b}, [x2] //load existing bytes where the possibly partial last block is to be stored + + and v9.16b, v9.16b, v0.16b //possibly partial last block has zeroes in highest bits + + rev64 v8.16b, v9.16b //GHASH final block + + eor v8.16b, v8.16b, v16.16b //feed in partial tag + + pmull2 v28.1q, v8.2d, v20.2d //GHASH final block - high + ins v16.d[0], v8.d[1] //GHASH final block - mid + + eor v17.16b, v17.16b, v28.16b //GHASH final block - high + eor v16.8b, v16.8b, v8.8b //GHASH final block - mid + + bif v12.16b, v26.16b, v0.16b //insert existing bytes in top 
end of result before storing + + pmull v16.1q, v16.1d, v21.1d //GHASH final block - mid + st1 { v12.16b}, [x2] //store all 16B + + pmull v26.1q, v8.1d, v20.1d //GHASH final block - low + + eor v18.16b, v18.16b, v16.16b //GHASH final block - mid + ldr d16, [x10] //MODULO - load modulo constant + + eor v19.16b, v19.16b, v26.16b //GHASH final block - low + + eor v14.16b, v17.16b, v19.16b //MODULO - karatsuba tidy up + + pmull v21.1q, v17.1d, v16.1d //MODULO - top 64b align with mid + ext v17.16b, v17.16b, v17.16b, #8 //MODULO - other top alignment + + eor v18.16b, v18.16b, v14.16b //MODULO - karatsuba tidy up + +.inst 0xce115652 //eor3 v18.16b, v18.16b, v17.16b, v21.16b //MODULO - fold into mid + + pmull v17.1q, v18.1d, v16.1d //MODULO - mid 64b align with low + ext v18.16b, v18.16b, v18.16b, #8 //MODULO - other mid alignment + +.inst 0xce124673 //eor3 v19.16b, v19.16b, v18.16b, v17.16b //MODULO - fold into low + ext v19.16b, v19.16b, v19.16b, #8 + rev64 v19.16b, v19.16b + st1 { v19.16b }, [x3] + rev32 v30.16b, v30.16b + + str q30, [x16] //store the updated counter + + mov x0, x9 + + ldp d10, d11, [sp, #16] + ldp d12, d13, [sp, #32] + ldp d14, d15, [sp, #48] + ldp d8, d9, [sp], #80 + ret +.L128_dec_ret: + mov w0, #0x0 + ret +.size unroll8_eor3_aes_gcm_dec_128_kernel,.-unroll8_eor3_aes_gcm_dec_128_kernel +.globl unroll8_eor3_aes_gcm_enc_192_kernel +.type unroll8_eor3_aes_gcm_enc_192_kernel,%function +.align 4 +unroll8_eor3_aes_gcm_enc_192_kernel: + AARCH64_VALID_CALL_TARGET + cbz x1, .L192_enc_ret + stp d8, d9, [sp, #-80]! + lsr x9, x1, #3 + mov x16, x4 + mov x8, x5 + stp d10, d11, [sp, #16] + stp d12, d13, [sp, #32] + stp d14, d15, [sp, #48] + mov x5, #0xc200000000000000 + stp x5, xzr, [sp, #64] + add x10, sp, #64 + + mov x5, x9 + ld1 { v0.16b}, [x16] //CTR block 0 + + mov x15, #0x100000000 //set up counter increment + movi v31.16b, #0x0 + mov v31.d[1], x15 + + rev32 v30.16b, v0.16b //set up reversed counter + + add v30.4s, v30.4s, v31.4s //CTR block 0 + + rev32 v1.16b, v30.16b //CTR block 1 + add v30.4s, v30.4s, v31.4s //CTR block 1 + + rev32 v2.16b, v30.16b //CTR block 2 + add v30.4s, v30.4s, v31.4s //CTR block 2 + + rev32 v3.16b, v30.16b //CTR block 3 + add v30.4s, v30.4s, v31.4s //CTR block 3 + + rev32 v4.16b, v30.16b //CTR block 4 + add v30.4s, v30.4s, v31.4s //CTR block 4 + sub x5, x5, #1 //byte_len - 1 + + and x5, x5, #0xffffffffffffff80 //number of bytes to be processed in main loop (at least 1 byte must be handled by tail) + + rev32 v5.16b, v30.16b //CTR block 5 + add v30.4s, v30.4s, v31.4s //CTR block 5 + ldp q26, q27, [x8, #0] //load rk0, rk1 + + add x5, x5, x0 + + rev32 v6.16b, v30.16b //CTR block 6 + add v30.4s, v30.4s, v31.4s //CTR block 6 + + rev32 v7.16b, v30.16b //CTR block 7 + + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 5 - round 0 + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 4 - round 0 + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 3 - round 0 + + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 0 - round 0 + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 1 - round 0 + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 7 - round 0 + + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 6 - round 0 + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 2 - round 0 + ldp q28, q26, [x8, #32] //load rk2, rk3 + + aese v5.16b, v27.16b + aesmc v5.16b, v5.16b //AES block 5 - round 1 + aese v7.16b, v27.16b + aesmc v7.16b, v7.16b //AES block 7 - round 1 + + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 2 - 
round 1 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 3 - round 1 + aese v6.16b, v27.16b + aesmc v6.16b, v6.16b //AES block 6 - round 1 + + aese v5.16b, v28.16b + aesmc v5.16b, v5.16b //AES block 5 - round 2 + aese v4.16b, v27.16b + aesmc v4.16b, v4.16b //AES block 4 - round 1 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 0 - round 1 + + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 1 - round 1 + aese v7.16b, v28.16b + aesmc v7.16b, v7.16b //AES block 7 - round 2 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 3 - round 2 + + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 2 - round 2 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 0 - round 2 + + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 1 - round 2 + aese v4.16b, v28.16b + aesmc v4.16b, v4.16b //AES block 4 - round 2 + aese v6.16b, v28.16b + aesmc v6.16b, v6.16b //AES block 6 - round 2 + + ldp q27, q28, [x8, #64] //load rk4, rk5 + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 4 - round 3 + + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 7 - round 3 + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 3 - round 3 + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 2 - round 3 + + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 1 - round 3 + + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 0 - round 3 + + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 6 - round 3 + + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 0 - round 4 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 1 - round 4 + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 5 - round 3 + + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 3 - round 4 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 2 - round 4 + aese v4.16b, v27.16b + aesmc v4.16b, v4.16b //AES block 4 - round 4 + + aese v6.16b, v27.16b + aesmc v6.16b, v6.16b //AES block 6 - round 4 + aese v7.16b, v27.16b + aesmc v7.16b, v7.16b //AES block 7 - round 4 + aese v5.16b, v27.16b + aesmc v5.16b, v5.16b //AES block 5 - round 4 + + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 1 - round 5 + ldp q26, q27, [x8, #96] //load rk6, rk7 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 2 - round 5 + + aese v4.16b, v28.16b + aesmc v4.16b, v4.16b //AES block 4 - round 5 + aese v7.16b, v28.16b + aesmc v7.16b, v7.16b //AES block 7 - round 5 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 0 - round 5 + + aese v5.16b, v28.16b + aesmc v5.16b, v5.16b //AES block 5 - round 5 + aese v6.16b, v28.16b + aesmc v6.16b, v6.16b //AES block 6 - round 5 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 3 - round 5 + + add v30.4s, v30.4s, v31.4s //CTR block 7 + + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 5 - round 6 + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 4 - round 6 + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 3 - round 6 + + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 2 - round 6 + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 6 - round 6 + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 1 - round 6 + + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 0 - round 6 + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 7 - round 6 + ldp q28, q26, [x8, #128] //load rk8, rk9 + + aese v6.16b, v27.16b + aesmc v6.16b, v6.16b //AES block 6 - round 7 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 3 - round 7 + + aese v4.16b, v27.16b + aesmc v4.16b, 
v4.16b //AES block 4 - round 7 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 0 - round 7 + + aese v7.16b, v27.16b + aesmc v7.16b, v7.16b //AES block 7 - round 7 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 1 - round 7 + + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 2 - round 7 + aese v5.16b, v27.16b + aesmc v5.16b, v5.16b //AES block 5 - round 7 + + aese v7.16b, v28.16b + aesmc v7.16b, v7.16b //AES block 7 - round 8 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 0 - round 8 + + aese v4.16b, v28.16b + aesmc v4.16b, v4.16b //AES block 4 - round 8 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 3 - round 8 + aese v5.16b, v28.16b + aesmc v5.16b, v5.16b //AES block 5 - round 8 + + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 2 - round 8 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 1 - round 8 + aese v6.16b, v28.16b + aesmc v6.16b, v6.16b //AES block 6 - round 8 + + add x4, x0, x1, lsr #3 //end_input_ptr + cmp x0, x5 //check if we have <= 8 blocks + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 3 - round 9 + + ld1 { v19.16b}, [x3] + ext v19.16b, v19.16b, v19.16b, #8 + rev64 v19.16b, v19.16b + ldp q27, q28, [x8, #160] //load rk10, rk11 + + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 6 - round 9 + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 1 - round 9 + + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 5 - round 9 + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 2 - round 9 + + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 0 - round 9 + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 4 - round 9 + + aese v6.16b, v27.16b + aesmc v6.16b, v6.16b //AES block 14 - round 10 + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 7 - round 9 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 11 - round 10 + + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 9 - round 10 + aese v5.16b, v27.16b + aesmc v5.16b, v5.16b //AES block 13 - round 10 + aese v4.16b, v27.16b + aesmc v4.16b, v4.16b //AES block 12 - round 10 + + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 8 - round 10 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 10 - round 10 + aese v7.16b, v27.16b + aesmc v7.16b, v7.16b //AES block 15 - round 10 + + aese v6.16b, v28.16b //AES block 14 - round 11 + aese v3.16b, v28.16b //AES block 11 - round 11 + + aese v4.16b, v28.16b //AES block 12 - round 11 + aese v7.16b, v28.16b //AES block 15 - round 11 + ldr q26, [x8, #192] //load rk12 + + aese v1.16b, v28.16b //AES block 9 - round 11 + aese v5.16b, v28.16b //AES block 13 - round 11 + + aese v2.16b, v28.16b //AES block 10 - round 11 + aese v0.16b, v28.16b //AES block 8 - round 11 + b.ge .L192_enc_tail //handle tail + + ldp q8, q9, [x0], #32 //AES block 0, 1 - load plaintext + + ldp q10, q11, [x0], #32 //AES block 2, 3 - load plaintext + + ldp q12, q13, [x0], #32 //AES block 4, 5 - load plaintext + + ldp q14, q15, [x0], #32 //AES block 6, 7 - load plaintext + +.inst 0xce006908 //eor3 v8.16b, v8.16b, v0.16b, v26.16b //AES block 0 - result + rev32 v0.16b, v30.16b //CTR block 8 + add v30.4s, v30.4s, v31.4s //CTR block 8 + +.inst 0xce03696b //eor3 v11.16b, v11.16b, v3.16b, v26.16b //AES block 3 - result +.inst 0xce016929 //eor3 v9.16b, v9.16b, v1.16b, v26.16b //AES block 1 - result + + rev32 v1.16b, v30.16b //CTR block 9 + add v30.4s, v30.4s, v31.4s //CTR block 9 +.inst 0xce04698c //eor3 v12.16b, v12.16b, v4.16b, v26.16b //AES block 4 - result + +.inst 0xce0569ad //eor3 
v13.16b, v13.16b, v5.16b, v26.16b //AES block 5 - result +.inst 0xce0769ef //eor3 v15.16b, v15.16b, v7.16b, v26.16b //AES block 7 - result + stp q8, q9, [x2], #32 //AES block 0, 1 - store result + +.inst 0xce02694a //eor3 v10.16b, v10.16b, v2.16b, v26.16b //AES block 2 - result + rev32 v2.16b, v30.16b //CTR block 10 + add v30.4s, v30.4s, v31.4s //CTR block 10 + + stp q10, q11, [x2], #32 //AES block 2, 3 - store result + cmp x0, x5 //check if we have <= 8 blocks + + rev32 v3.16b, v30.16b //CTR block 11 + add v30.4s, v30.4s, v31.4s //CTR block 11 +.inst 0xce0669ce //eor3 v14.16b, v14.16b, v6.16b, v26.16b //AES block 6 - result + + stp q12, q13, [x2], #32 //AES block 4, 5 - store result + + rev32 v4.16b, v30.16b //CTR block 12 + stp q14, q15, [x2], #32 //AES block 6, 7 - store result + add v30.4s, v30.4s, v31.4s //CTR block 12 + + b.ge .L192_enc_prepretail //do prepretail + +.L192_enc_main_loop: //main loop start + rev64 v12.16b, v12.16b //GHASH block 8k+4 (t0, t1, and t2 free) + ldp q26, q27, [x8, #0] //load rk0, rk1 + rev64 v10.16b, v10.16b //GHASH block 8k+2 + + rev32 v5.16b, v30.16b //CTR block 8k+13 + add v30.4s, v30.4s, v31.4s //CTR block 8k+13 + ldr q23, [x3, #176] //load h7l | h7h + ext v23.16b, v23.16b, v23.16b, #8 + ldr q25, [x3, #208] //load h8l | h8h + ext v25.16b, v25.16b, v25.16b, #8 + + ext v19.16b, v19.16b, v19.16b, #8 //PRE 0 + rev64 v8.16b, v8.16b //GHASH block 8k + ldr q20, [x3, #128] //load h5l | h5h + ext v20.16b, v20.16b, v20.16b, #8 + ldr q22, [x3, #160] //load h6l | h6h + ext v22.16b, v22.16b, v22.16b, #8 + + rev64 v9.16b, v9.16b //GHASH block 8k+1 + rev32 v6.16b, v30.16b //CTR block 8k+14 + add v30.4s, v30.4s, v31.4s //CTR block 8k+14 + + eor v8.16b, v8.16b, v19.16b //PRE 1 + rev64 v11.16b, v11.16b //GHASH block 8k+3 + rev64 v13.16b, v13.16b //GHASH block 8k+5 (t0, t1, t2 and t3 free) + + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 0 + rev32 v7.16b, v30.16b //CTR block 8k+15 + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 0 + + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 0 + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 0 + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 0 + + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 0 + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 0 + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 0 + + ldp q28, q26, [x8, #32] //load rk2, rk3 + pmull2 v17.1q, v8.2d, v25.2d //GHASH block 8k - high + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 1 + + aese v4.16b, v27.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 1 + pmull2 v16.1q, v9.2d, v23.2d //GHASH block 8k+1 - high + pmull v23.1q, v9.1d, v23.1d //GHASH block 8k+1 - low + + trn1 v18.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 1 + ldr q21, [x3, #144] //load h6k | h5k + ldr q24, [x3, #192] //load h8k | h7k + + pmull2 v29.1q, v10.2d, v22.2d //GHASH block 8k+2 - high + pmull v19.1q, v8.1d, v25.1d //GHASH block 8k - low + trn2 v8.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid + + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 1 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 1 + aese v5.16b, v27.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 1 + + eor v17.16b, v17.16b, v16.16b //GHASH block 8k+1 - high + aese v6.16b, v27.16b + aesmc v6.16b, v6.16b //AES 
block 8k+14 - round 1 + aese v7.16b, v27.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 1 + + pmull2 v9.1q, v11.2d, v20.2d //GHASH block 8k+3 - high + eor v8.16b, v8.16b, v18.16b //GHASH block 8k, 8k+1 - mid + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 2 + + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 2 + aese v4.16b, v28.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 2 + aese v6.16b, v28.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 2 + + aese v5.16b, v28.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 2 + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 3 +.inst 0xce1d2631 //eor3 v17.16b, v17.16b, v29.16b, v9.16b //GHASH block 8k+2, 8k+3 - high + + pmull v22.1q, v10.1d, v22.1d //GHASH block 8k+2 - low + aese v7.16b, v28.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 2 + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 3 + + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 2 + trn1 v29.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 2 + + trn2 v10.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 3 + ldp q27, q28, [x8, #64] //load rk4, rk5 + + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 3 + eor v19.16b, v19.16b, v23.16b //GHASH block 8k+1 - low + ldr q23, [x3, #80] //load h3l | h3h + ext v23.16b, v23.16b, v23.16b, #8 + ldr q25, [x3, #112] //load h4l | h4h + ext v25.16b, v25.16b, v25.16b, #8 + + pmull2 v18.1q, v8.2d, v24.2d //GHASH block 8k - mid + pmull v24.1q, v8.1d, v24.1d //GHASH block 8k+1 - mid + pmull v20.1q, v11.1d, v20.1d //GHASH block 8k+3 - low + + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 3 + eor v10.16b, v10.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid + trn1 v16.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid + + eor v18.16b, v18.16b, v24.16b //GHASH block 8k+1 - mid + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 3 +.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+2, 8k+3 - low + + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 4 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 4 + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 3 + + pmull2 v29.1q, v10.2d, v21.2d //GHASH block 8k+2 - mid + aese v6.16b, v27.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 4 + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 3 + + pmull v21.1q, v10.1d, v21.1d //GHASH block 8k+3 - mid + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 4 + aese v4.16b, v27.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 4 + + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 4 + aese v5.16b, v27.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 4 + aese v7.16b, v27.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 4 + +.inst 0xce157652 //eor3 v18.16b, v18.16b, v21.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid + aese v4.16b, v28.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 5 + ldr q20, [x3, #32] //load h1l | h1h + ext v20.16b, v20.16b, v20.16b, #8 + ldr q22, [x3, #64] //load h2l | h2h + ext v22.16b, v22.16b, v22.16b, #8 + + ldp q26, q27, [x8, #96] //load rk6, rk7 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 5 + rev64 v15.16b, v15.16b //GHASH block 8k+7 
(t0, t1, t2 and t3 free) + + rev64 v14.16b, v14.16b //GHASH block 8k+6 (t0, t1, and t2 free) + pmull2 v8.1q, v12.2d, v25.2d //GHASH block 8k+4 - high + pmull v25.1q, v12.1d, v25.1d //GHASH block 8k+4 - low + + aese v5.16b, v28.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 5 + trn2 v12.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid + + aese v6.16b, v28.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 5 + ldr q21, [x3, #48] //load h2k | h1k + ldr q24, [x3, #96] //load h4k | h3k + + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 5 + pmull2 v10.1q, v13.2d, v23.2d //GHASH block 8k+5 - high + eor v12.16b, v12.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid + + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 5 + aese v7.16b, v28.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 5 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 5 + + pmull v23.1q, v13.1d, v23.1d //GHASH block 8k+5 - low + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 6 + trn1 v13.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid + + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 6 + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 6 + pmull2 v11.1q, v14.2d, v22.2d //GHASH block 8k+6 - high + + pmull v22.1q, v14.1d, v22.1d //GHASH block 8k+6 - low + trn2 v14.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 6 + + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 6 + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 6 + + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 6 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 7 + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 6 + + aese v6.16b, v27.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 7 + eor v14.16b, v14.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid + + pmull2 v16.1q, v12.2d, v24.2d //GHASH block 8k+4 - mid + ldp q28, q26, [x8, #128] //load rk8, rk9 + pmull v24.1q, v12.1d, v24.1d //GHASH block 8k+5 - mid + + aese v4.16b, v27.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 7 + pmull2 v12.1q, v15.2d, v20.2d //GHASH block 8k+7 - high + aese v5.16b, v27.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 7 + +.inst 0xce184252 //eor3 v18.16b, v18.16b, v24.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid + aese v7.16b, v27.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 7 + add v30.4s, v30.4s, v31.4s //CTR block 8k+15 + + ldr d16, [x10] //MODULO - load modulo constant +.inst 0xce082a31 //eor3 v17.16b, v17.16b, v8.16b, v10.16b //GHASH block 8k+4, 8k+5 - high + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 7 + + pmull2 v13.1q, v14.2d, v21.2d //GHASH block 8k+6 - mid + pmull v20.1q, v15.1d, v20.1d //GHASH block 8k+7 - low + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 7 + + aese v5.16b, v28.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 8 + aese v4.16b, v28.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 8 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 8 + + aese v6.16b, v28.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 8 +.inst 0xce195e73 //eor3 v19.16b, v19.16b, v25.16b, v23.16b //GHASH block 8k+4, 8k+5 - low + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 7 + + aese v7.16b, v28.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 8 + aese 
v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 8 + pmull v21.1q, v14.1d, v21.1d //GHASH block 8k+7 - mid + + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 8 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 8 + ldp q27, q28, [x8, #160] //load rk10, rk11 + +.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+6, 8k+7 - low + rev32 v20.16b, v30.16b //CTR block 8k+16 + add v30.4s, v30.4s, v31.4s //CTR block 8k+16 + + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 9 +.inst 0xce153652 //eor3 v18.16b, v18.16b, v21.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid +.inst 0xce0b3231 //eor3 v17.16b, v17.16b, v11.16b, v12.16b //GHASH block 8k+6, 8k+7 - high + + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 9 + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 9 + ldp q8, q9, [x0], #32 //AES block 8k+8, 8k+9 - load plaintext + + pmull v21.1q, v17.1d, v16.1d //MODULO - top 64b align with mid + rev32 v22.16b, v30.16b //CTR block 8k+17 + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 9 + + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 9 + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 9 + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 9 + +.inst 0xce114e52 //eor3 v18.16b, v18.16b, v17.16b, v19.16b //MODULO - karatsuba tidy up + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 9 + add v30.4s, v30.4s, v31.4s //CTR block 8k+17 + + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 10 + aese v4.16b, v27.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 10 + ldr q26, [x8, #192] //load rk12 + ext v29.16b, v17.16b, v17.16b, #8 //MODULO - other top alignment + + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 10 + aese v7.16b, v27.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 10 + ldp q10, q11, [x0], #32 //AES block 8k+10, 8k+11 - load plaintext + + aese v4.16b, v28.16b //AES block 8k+12 - round 11 +.inst 0xce1d5652 //eor3 v18.16b, v18.16b, v29.16b, v21.16b //MODULO - fold into mid + ldp q12, q13, [x0], #32 //AES block 8k+12, 8k+13 - load plaintext + + ldp q14, q15, [x0], #32 //AES block 8k+14, 8k+15 - load plaintext + aese v2.16b, v28.16b //AES block 8k+10 - round 11 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 10 + + rev32 v23.16b, v30.16b //CTR block 8k+18 + aese v5.16b, v27.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 10 + + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 10 + pmull v17.1q, v18.1d, v16.1d //MODULO - mid 64b align with low + + aese v6.16b, v27.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 10 + aese v5.16b, v28.16b //AES block 8k+13 - round 11 + add v30.4s, v30.4s, v31.4s //CTR block 8k+18 + + aese v7.16b, v28.16b //AES block 8k+15 - round 11 + aese v0.16b, v28.16b //AES block 8k+8 - round 11 +.inst 0xce04698c //eor3 v12.16b, v12.16b, v4.16b, v26.16b //AES block 4 - result + + aese v6.16b, v28.16b //AES block 8k+14 - round 11 + aese v3.16b, v28.16b //AES block 8k+11 - round 11 + aese v1.16b, v28.16b //AES block 8k+9 - round 11 + + rev32 v25.16b, v30.16b //CTR block 8k+19 + add v30.4s, v30.4s, v31.4s //CTR block 8k+19 +.inst 0xce0769ef //eor3 v15.16b, v15.16b, v7.16b, v26.16b //AES block 7 - result + +.inst 0xce02694a //eor3 v10.16b, v10.16b, v2.16b, v26.16b //AES block 8k+10 - result +.inst 0xce006908 //eor3 v8.16b, v8.16b, 
v0.16b, v26.16b //AES block 8k+8 - result + mov v2.16b, v23.16b //CTR block 8k+18 + +.inst 0xce016929 //eor3 v9.16b, v9.16b, v1.16b, v26.16b //AES block 8k+9 - result + mov v1.16b, v22.16b //CTR block 8k+17 + stp q8, q9, [x2], #32 //AES block 8k+8, 8k+9 - store result + ext v21.16b, v18.16b, v18.16b, #8 //MODULO - other mid alignment + +.inst 0xce0669ce //eor3 v14.16b, v14.16b, v6.16b, v26.16b //AES block 6 - result + mov v0.16b, v20.16b //CTR block 8k+16 + rev32 v4.16b, v30.16b //CTR block 8k+20 + + add v30.4s, v30.4s, v31.4s //CTR block 8k+20 +.inst 0xce0569ad //eor3 v13.16b, v13.16b, v5.16b, v26.16b //AES block 5 - result +.inst 0xce115673 //eor3 v19.16b, v19.16b, v17.16b, v21.16b //MODULO - fold into low + +.inst 0xce03696b //eor3 v11.16b, v11.16b, v3.16b, v26.16b //AES block 8k+11 - result + mov v3.16b, v25.16b //CTR block 8k+19 + + stp q10, q11, [x2], #32 //AES block 8k+10, 8k+11 - store result + + stp q12, q13, [x2], #32 //AES block 8k+12, 8k+13 - store result + + cmp x0, x5 //.LOOP CONTROL + stp q14, q15, [x2], #32 //AES block 8k+14, 8k+15 - store result + b.lt .L192_enc_main_loop + +.L192_enc_prepretail: //PREPRETAIL + rev32 v5.16b, v30.16b //CTR block 8k+13 + ldp q26, q27, [x8, #0] //load rk0, rk1 + add v30.4s, v30.4s, v31.4s //CTR block 8k+13 + + ldr q23, [x3, #176] //load h7l | h7h + ext v23.16b, v23.16b, v23.16b, #8 + ldr q25, [x3, #208] //load h8l | h8h + ext v25.16b, v25.16b, v25.16b, #8 + rev64 v8.16b, v8.16b //GHASH block 8k + ext v19.16b, v19.16b, v19.16b, #8 //PRE 0 + + rev32 v6.16b, v30.16b //CTR block 8k+14 + add v30.4s, v30.4s, v31.4s //CTR block 8k+14 + ldr q21, [x3, #144] //load h6k | h5k + ldr q24, [x3, #192] //load h8k | h7k + + rev64 v11.16b, v11.16b //GHASH block 8k+3 + rev64 v10.16b, v10.16b //GHASH block 8k+2 + ldr q20, [x3, #128] //load h5l | h5h + ext v20.16b, v20.16b, v20.16b, #8 + ldr q22, [x3, #160] //load h6l | h6h + ext v22.16b, v22.16b, v22.16b, #8 + + eor v8.16b, v8.16b, v19.16b //PRE 1 + rev32 v7.16b, v30.16b //CTR block 8k+15 + rev64 v9.16b, v9.16b //GHASH block 8k+1 + + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 0 + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 0 + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 0 + + pmull2 v16.1q, v9.2d, v23.2d //GHASH block 8k+1 - high + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 0 + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 0 + + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 0 + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 0 + pmull2 v17.1q, v8.2d, v25.2d //GHASH block 8k - high + + aese v6.16b, v27.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 1 + pmull v19.1q, v8.1d, v25.1d //GHASH block 8k - low + trn1 v18.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid + + trn2 v8.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 0 + ldp q28, q26, [x8, #32] //load rk2, rk3 + + pmull v23.1q, v9.1d, v23.1d //GHASH block 8k+1 - low + eor v17.16b, v17.16b, v16.16b //GHASH block 8k+1 - high + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 1 + + aese v5.16b, v27.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 1 + eor v8.16b, v8.16b, v18.16b //GHASH block 8k, 8k+1 - mid + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 1 + + aese v7.16b, v27.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 1 + pmull2 v9.1q, v11.2d, v20.2d //GHASH block 
8k+3 - high + pmull2 v29.1q, v10.2d, v22.2d //GHASH block 8k+2 - high + + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 1 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 1 + aese v4.16b, v27.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 1 + + pmull v22.1q, v10.1d, v22.1d //GHASH block 8k+2 - low + aese v5.16b, v28.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 2 + eor v19.16b, v19.16b, v23.16b //GHASH block 8k+1 - low + + pmull v20.1q, v11.1d, v20.1d //GHASH block 8k+3 - low + aese v7.16b, v28.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 2 +.inst 0xce1d2631 //eor3 v17.16b, v17.16b, v29.16b, v9.16b //GHASH block 8k+2, 8k+3 - high + + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 3 + trn1 v29.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid + aese v6.16b, v28.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 2 + + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 2 + pmull2 v18.1q, v8.2d, v24.2d //GHASH block 8k - mid + trn2 v10.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid + + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 2 + rev64 v13.16b, v13.16b //GHASH block 8k+5 (t0, t1, t2 and t3 free) + rev64 v14.16b, v14.16b //GHASH block 8k+6 (t0, t1, and t2 free) + + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 2 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 2 + aese v4.16b, v28.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 2 + + eor v10.16b, v10.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid + pmull v24.1q, v8.1d, v24.1d //GHASH block 8k+1 - mid + ldp q27, q28, [x8, #64] //load rk4, rk5 + + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 3 + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 3 + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 3 + + eor v18.16b, v18.16b, v24.16b //GHASH block 8k+1 - mid +.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+2, 8k+3 - low + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 3 + + ldr q23, [x3, #80] //load h3l | h3h + ext v23.16b, v23.16b, v23.16b, #8 + ldr q25, [x3, #112] //load h4l | h4h + ext v25.16b, v25.16b, v25.16b, #8 + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 3 + pmull2 v29.1q, v10.2d, v21.2d //GHASH block 8k+2 - mid + + ldr q20, [x3, #32] //load h1l | h1h + ext v20.16b, v20.16b, v20.16b, #8 + ldr q22, [x3, #64] //load h2l | h2h + ext v22.16b, v22.16b, v22.16b, #8 + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 3 + rev64 v12.16b, v12.16b //GHASH block 8k+4 (t0, t1, and t2 free) + + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 3 + pmull v21.1q, v10.1d, v21.1d //GHASH block 8k+3 - mid + aese v6.16b, v27.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 4 + + trn1 v16.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid + aese v7.16b, v27.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 4 + aese v5.16b, v27.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 4 + +.inst 0xce157652 //eor3 v18.16b, v18.16b, v21.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 4 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 4 + + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 4 + aese v4.16b, v27.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 4 + aese v2.16b, v27.16b + 
aesmc v2.16b, v2.16b //AES block 8k+10 - round 4 + + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 5 + rev64 v15.16b, v15.16b //GHASH block 8k+7 (t0, t1, t2 and t3 free) + ldr q21, [x3, #48] //load h2k | h1k + ldr q24, [x3, #96] //load h4k | h3k + + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 5 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 5 + ldp q26, q27, [x8, #96] //load rk6, rk7 + + pmull2 v11.1q, v14.2d, v22.2d //GHASH block 8k+6 - high + pmull2 v8.1q, v12.2d, v25.2d //GHASH block 8k+4 - high + pmull v25.1q, v12.1d, v25.1d //GHASH block 8k+4 - low + + aese v4.16b, v28.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 5 + trn2 v12.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid + + pmull2 v10.1q, v13.2d, v23.2d //GHASH block 8k+5 - high + pmull v23.1q, v13.1d, v23.1d //GHASH block 8k+5 - low + pmull v22.1q, v14.1d, v22.1d //GHASH block 8k+6 - low + + trn1 v13.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid + eor v12.16b, v12.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid + trn2 v14.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid + + aese v5.16b, v28.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 5 + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 6 + aese v7.16b, v28.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 5 + + aese v6.16b, v28.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 5 + eor v14.16b, v14.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 5 + + pmull2 v16.1q, v12.2d, v24.2d //GHASH block 8k+4 - mid + pmull v24.1q, v12.1d, v24.1d //GHASH block 8k+5 - mid + + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 6 + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 6 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 7 + + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 6 + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 6 +.inst 0xce184252 //eor3 v18.16b, v18.16b, v24.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid + + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 6 +.inst 0xce082a31 //eor3 v17.16b, v17.16b, v8.16b, v10.16b //GHASH block 8k+4, 8k+5 - high + aese v5.16b, v27.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 7 + + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 6 + ldr d16, [x10] //MODULO - load modulo constant + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 6 + + pmull2 v13.1q, v14.2d, v21.2d //GHASH block 8k+6 - mid + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 7 +.inst 0xce195e73 //eor3 v19.16b, v19.16b, v25.16b, v23.16b //GHASH block 8k+4, 8k+5 - low + + pmull2 v12.1q, v15.2d, v20.2d //GHASH block 8k+7 - high + pmull v21.1q, v14.1d, v21.1d //GHASH block 8k+7 - mid + pmull v20.1q, v15.1d, v20.1d //GHASH block 8k+7 - low + + aese v4.16b, v27.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 7 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 7 + ldp q28, q26, [x8, #128] //load rk8, rk9 + + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 7 +.inst 0xce153652 //eor3 v18.16b, v18.16b, v21.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid + +.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+6, 8k+7 - low +.inst 0xce0b3231 //eor3 v17.16b, v17.16b, v11.16b, v12.16b //GHASH block 8k+6, 8k+7 - high + +.inst 0xce114e52 //eor3 
v18.16b, v18.16b, v17.16b, v19.16b //MODULO - karatsuba tidy up + ext v29.16b, v17.16b, v17.16b, #8 //MODULO - other top alignment + aese v7.16b, v27.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 7 + pmull v21.1q, v17.1d, v16.1d //MODULO - top 64b align with mid + + aese v5.16b, v28.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 8 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 8 + + aese v6.16b, v27.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 7 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 8 +.inst 0xce1d5652 //eor3 v18.16b, v18.16b, v29.16b, v21.16b //MODULO - fold into mid + + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 8 + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 9 + aese v4.16b, v28.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 8 + + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 8 + aese v7.16b, v28.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 8 + aese v6.16b, v28.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 8 + + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 9 + ldp q27, q28, [x8, #160] //load rk10, rk11 + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 9 + + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 9 + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 9 + + ext v21.16b, v18.16b, v18.16b, #8 //MODULO - other mid alignment + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 9 + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 9 + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 9 + + pmull v17.1q, v18.1d, v16.1d //MODULO - mid 64b align with low + ldr q26, [x8, #192] //load rk12 + + aese v7.16b, v27.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 10 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 10 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 10 + +.inst 0xce115673 //eor3 v19.16b, v19.16b, v17.16b, v21.16b //MODULO - fold into low + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 10 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 10 + + aese v1.16b, v28.16b //AES block 8k+9 - round 11 + aese v7.16b, v28.16b //AES block 8k+15 - round 11 + + aese v4.16b, v27.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 10 + aese v3.16b, v28.16b //AES block 8k+11 - round 11 + + aese v5.16b, v27.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 10 + aese v6.16b, v27.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 10 + + add v30.4s, v30.4s, v31.4s //CTR block 8k+15 + aese v2.16b, v28.16b //AES block 8k+10 - round 11 + aese v0.16b, v28.16b //AES block 8k+8 - round 11 + + aese v6.16b, v28.16b //AES block 8k+14 - round 11 + aese v4.16b, v28.16b //AES block 8k+12 - round 11 + aese v5.16b, v28.16b //AES block 8k+13 - round 11 + +.L192_enc_tail: //TAIL + + ldp q20, q21, [x3, #128] //load h5l | h5h + ext v20.16b, v20.16b, v20.16b, #8 + sub x5, x4, x0 //main_end_input_ptr is number of bytes left to process + + ldr q8, [x0], #16 //AES block 8k+8 - l3ad plaintext + + ldp q24, q25, [x3, #192] //load h8k | h7k + ext v25.16b, v25.16b, v25.16b, #8 + + mov v29.16b, v26.16b + + ldp q22, q23, [x3, #160] //load h6l | h6h + ext v22.16b, v22.16b, v22.16b, #8 + ext v23.16b, v23.16b, v23.16b, #8 + cmp x5, #112 + +.inst 0xce007509 //eor3 v9.16b, v8.16b, v0.16b, v29.16b //AES block 8k+8 - 
result + ext v16.16b, v19.16b, v19.16b, #8 //prepare final partial tag + b.gt .L192_enc_blocks_more_than_7 + + cmp x5, #96 + mov v7.16b, v6.16b + movi v17.8b, #0 + + mov v6.16b, v5.16b + movi v19.8b, #0 + sub v30.4s, v30.4s, v31.4s + + mov v5.16b, v4.16b + mov v4.16b, v3.16b + mov v3.16b, v2.16b + + mov v2.16b, v1.16b + movi v18.8b, #0 + b.gt .L192_enc_blocks_more_than_6 + + mov v7.16b, v6.16b + cmp x5, #80 + + mov v6.16b, v5.16b + mov v5.16b, v4.16b + mov v4.16b, v3.16b + + mov v3.16b, v1.16b + sub v30.4s, v30.4s, v31.4s + b.gt .L192_enc_blocks_more_than_5 + + cmp x5, #64 + sub v30.4s, v30.4s, v31.4s + + mov v7.16b, v6.16b + mov v6.16b, v5.16b + mov v5.16b, v4.16b + + mov v4.16b, v1.16b + b.gt .L192_enc_blocks_more_than_4 + + mov v7.16b, v6.16b + mov v6.16b, v5.16b + mov v5.16b, v1.16b + + sub v30.4s, v30.4s, v31.4s + cmp x5, #48 + b.gt .L192_enc_blocks_more_than_3 + + mov v7.16b, v6.16b + mov v6.16b, v1.16b + sub v30.4s, v30.4s, v31.4s + + ldr q24, [x3, #96] //load h4k | h3k + cmp x5, #32 + b.gt .L192_enc_blocks_more_than_2 + + sub v30.4s, v30.4s, v31.4s + + cmp x5, #16 + mov v7.16b, v1.16b + b.gt .L192_enc_blocks_more_than_1 + + sub v30.4s, v30.4s, v31.4s + ldr q21, [x3, #48] //load h2k | h1k + b .L192_enc_blocks_less_than_1 +.L192_enc_blocks_more_than_7: //blocks left > 7 + st1 { v9.16b}, [x2], #16 //AES final-7 block - store result + + rev64 v8.16b, v9.16b //GHASH final-7 block + ins v18.d[0], v24.d[1] //GHASH final-7 block - mid + + eor v8.16b, v8.16b, v16.16b //feed in partial tag + + ins v27.d[0], v8.d[1] //GHASH final-7 block - mid + + ldr q9, [x0], #16 //AES final-6 block - load plaintext + + eor v27.8b, v27.8b, v8.8b //GHASH final-7 block - mid + movi v16.8b, #0 //suppress further partial tag feed in + pmull v19.1q, v8.1d, v25.1d //GHASH final-7 block - low + + pmull2 v17.1q, v8.2d, v25.2d //GHASH final-7 block - high + + pmull v18.1q, v27.1d, v18.1d //GHASH final-7 block - mid +.inst 0xce017529 //eor3 v9.16b, v9.16b, v1.16b, v29.16b //AES final-6 block - result +.L192_enc_blocks_more_than_6: //blocks left > 6 + + st1 { v9.16b}, [x2], #16 //AES final-6 block - store result + + rev64 v8.16b, v9.16b //GHASH final-6 block + + ldr q9, [x0], #16 //AES final-5 block - load plaintext + + eor v8.16b, v8.16b, v16.16b //feed in partial tag + + ins v27.d[0], v8.d[1] //GHASH final-6 block - mid + + pmull v26.1q, v8.1d, v23.1d //GHASH final-6 block - low +.inst 0xce027529 //eor3 v9.16b, v9.16b, v2.16b, v29.16b //AES final-5 block - result + + movi v16.8b, #0 //suppress further partial tag feed in + pmull2 v28.1q, v8.2d, v23.2d //GHASH final-6 block - high + eor v27.8b, v27.8b, v8.8b //GHASH final-6 block - mid + + pmull v27.1q, v27.1d, v24.1d //GHASH final-6 block - mid + + eor v17.16b, v17.16b, v28.16b //GHASH final-6 block - high + eor v19.16b, v19.16b, v26.16b //GHASH final-6 block - low + + eor v18.16b, v18.16b, v27.16b //GHASH final-6 block - mid +.L192_enc_blocks_more_than_5: //blocks left > 5 + + st1 { v9.16b}, [x2], #16 //AES final-5 block - store result + + rev64 v8.16b, v9.16b //GHASH final-5 block + + eor v8.16b, v8.16b, v16.16b //feed in partial tag + + ins v27.d[0], v8.d[1] //GHASH final-5 block - mid + + ldr q9, [x0], #16 //AES final-4 block - load plaintext + pmull2 v28.1q, v8.2d, v22.2d //GHASH final-5 block - high + + eor v27.8b, v27.8b, v8.8b //GHASH final-5 block - mid + eor v17.16b, v17.16b, v28.16b //GHASH final-5 block - high + + ins v27.d[1], v27.d[0] //GHASH final-5 block - mid + pmull v26.1q, v8.1d, v22.1d //GHASH final-5 block - low + + eor v19.16b, v19.16b, v26.16b 
//GHASH final-5 block - low + pmull2 v27.1q, v27.2d, v21.2d //GHASH final-5 block - mid + +.inst 0xce037529 //eor3 v9.16b, v9.16b, v3.16b, v29.16b //AES final-4 block - result + movi v16.8b, #0 //suppress further partial tag feed in + + eor v18.16b, v18.16b, v27.16b //GHASH final-5 block - mid +.L192_enc_blocks_more_than_4: //blocks left > 4 + + st1 { v9.16b}, [x2], #16 //AES final-4 block - store result + + rev64 v8.16b, v9.16b //GHASH final-4 block + + eor v8.16b, v8.16b, v16.16b //feed in partial tag + + ldr q9, [x0], #16 //AES final-3 block - load plaintext + pmull2 v28.1q, v8.2d, v20.2d //GHASH final-4 block - high + ins v27.d[0], v8.d[1] //GHASH final-4 block - mid + + pmull v26.1q, v8.1d, v20.1d //GHASH final-4 block - low + eor v17.16b, v17.16b, v28.16b //GHASH final-4 block - high + + eor v27.8b, v27.8b, v8.8b //GHASH final-4 block - mid + + movi v16.8b, #0 //suppress further partial tag feed in + eor v19.16b, v19.16b, v26.16b //GHASH final-4 block - low + + pmull v27.1q, v27.1d, v21.1d //GHASH final-4 block - mid + + eor v18.16b, v18.16b, v27.16b //GHASH final-4 block - mid +.inst 0xce047529 //eor3 v9.16b, v9.16b, v4.16b, v29.16b //AES final-3 block - result +.L192_enc_blocks_more_than_3: //blocks left > 3 + + ldr q24, [x3, #96] //load h4k | h3k + st1 { v9.16b}, [x2], #16 //AES final-3 block - store result + + rev64 v8.16b, v9.16b //GHASH final-3 block + + eor v8.16b, v8.16b, v16.16b //feed in partial tag + movi v16.8b, #0 //suppress further partial tag feed in + + ldr q9, [x0], #16 //AES final-2 block - load plaintext + ldr q25, [x3, #112] //load h4l | h4h + ext v25.16b, v25.16b, v25.16b, #8 + + ins v27.d[0], v8.d[1] //GHASH final-3 block - mid + +.inst 0xce057529 //eor3 v9.16b, v9.16b, v5.16b, v29.16b //AES final-2 block - result + eor v27.8b, v27.8b, v8.8b //GHASH final-3 block - mid + + ins v27.d[1], v27.d[0] //GHASH final-3 block - mid + pmull v26.1q, v8.1d, v25.1d //GHASH final-3 block - low + + pmull2 v28.1q, v8.2d, v25.2d //GHASH final-3 block - high + pmull2 v27.1q, v27.2d, v24.2d //GHASH final-3 block - mid + + eor v19.16b, v19.16b, v26.16b //GHASH final-3 block - low + + eor v18.16b, v18.16b, v27.16b //GHASH final-3 block - mid + eor v17.16b, v17.16b, v28.16b //GHASH final-3 block - high +.L192_enc_blocks_more_than_2: //blocks left > 2 + + st1 { v9.16b}, [x2], #16 //AES final-2 block - store result + + rev64 v8.16b, v9.16b //GHASH final-2 block + ldr q23, [x3, #80] //load h3l | h3h + ext v23.16b, v23.16b, v23.16b, #8 + + eor v8.16b, v8.16b, v16.16b //feed in partial tag + + ldr q9, [x0], #16 //AES final-1 block - load plaintext + ins v27.d[0], v8.d[1] //GHASH final-2 block - mid + + eor v27.8b, v27.8b, v8.8b //GHASH final-2 block - mid + + pmull v26.1q, v8.1d, v23.1d //GHASH final-2 block - low + pmull2 v28.1q, v8.2d, v23.2d //GHASH final-2 block - high + movi v16.8b, #0 //suppress further partial tag feed in + + pmull v27.1q, v27.1d, v24.1d //GHASH final-2 block - mid + + eor v19.16b, v19.16b, v26.16b //GHASH final-2 block - low + eor v17.16b, v17.16b, v28.16b //GHASH final-2 block - high + + eor v18.16b, v18.16b, v27.16b //GHASH final-2 block - mid +.inst 0xce067529 //eor3 v9.16b, v9.16b, v6.16b, v29.16b //AES final-1 block - result +.L192_enc_blocks_more_than_1: //blocks left > 1 + + ldr q22, [x3, #64] //load h1l | h1h + ext v22.16b, v22.16b, v22.16b, #8 + st1 { v9.16b}, [x2], #16 //AES final-1 block - store result + + rev64 v8.16b, v9.16b //GHASH final-1 block + + eor v8.16b, v8.16b, v16.16b //feed in partial tag + + ins v27.d[0], v8.d[1] //GHASH final-1 block - 
mid + pmull v26.1q, v8.1d, v22.1d //GHASH final-1 block - low + + eor v19.16b, v19.16b, v26.16b //GHASH final-1 block - low + pmull2 v28.1q, v8.2d, v22.2d //GHASH final-1 block - high + eor v27.8b, v27.8b, v8.8b //GHASH final-1 block - mid + + ldr q9, [x0], #16 //AES final block - load plaintext + ldr q21, [x3, #48] //load h2k | h1k + + ins v27.d[1], v27.d[0] //GHASH final-1 block - mid + +.inst 0xce077529 //eor3 v9.16b, v9.16b, v7.16b, v29.16b //AES final block - result + pmull2 v27.1q, v27.2d, v21.2d //GHASH final-1 block - mid + + movi v16.8b, #0 //suppress further partial tag feed in + + eor v18.16b, v18.16b, v27.16b //GHASH final-1 block - mid + eor v17.16b, v17.16b, v28.16b //GHASH final-1 block - high +.L192_enc_blocks_less_than_1: //blocks left <= 1 + + mvn x6, xzr //temp0_x = 0xffffffffffffffff + and x1, x1, #127 //bit_length %= 128 + + sub x1, x1, #128 //bit_length -= 128 + + neg x1, x1 //bit_length = 128 - #bits in input (in range [1,128]) + + and x1, x1, #127 //bit_length %= 128 + + lsr x6, x6, x1 //temp0_x is mask for top 64b of last block + cmp x1, #64 + mvn x7, xzr //temp1_x = 0xffffffffffffffff + + csel x13, x7, x6, lt + csel x14, x6, xzr, lt + + mov v0.d[1], x14 + ldr q20, [x3, #32] //load h1l | h1h + ext v20.16b, v20.16b, v20.16b, #8 + + ld1 { v26.16b}, [x2] //load existing bytes where the possibly partial last block is to be stored + mov v0.d[0], x13 //ctr0b is mask for last block + + and v9.16b, v9.16b, v0.16b //possibly partial last block has zeroes in highest bits + + rev64 v8.16b, v9.16b //GHASH final block + bif v9.16b, v26.16b, v0.16b //insert existing bytes in top end of result before storing + + st1 { v9.16b}, [x2] //store all 16B + + eor v8.16b, v8.16b, v16.16b //feed in partial tag + + ins v16.d[0], v8.d[1] //GHASH final block - mid + pmull2 v28.1q, v8.2d, v20.2d //GHASH final block - high + + eor v17.16b, v17.16b, v28.16b //GHASH final block - high + pmull v26.1q, v8.1d, v20.1d //GHASH final block - low + + eor v16.8b, v16.8b, v8.8b //GHASH final block - mid + + pmull v16.1q, v16.1d, v21.1d //GHASH final block - mid + + eor v18.16b, v18.16b, v16.16b //GHASH final block - mid + ldr d16, [x10] //MODULO - load modulo constant + + eor v19.16b, v19.16b, v26.16b //GHASH final block - low + ext v21.16b, v17.16b, v17.16b, #8 //MODULO - other top alignment + + rev32 v30.16b, v30.16b + + str q30, [x16] //store the updated counter +.inst 0xce114e52 //eor3 v18.16b, v18.16b, v17.16b, v19.16b //MODULO - karatsuba tidy up + + pmull v29.1q, v17.1d, v16.1d //MODULO - top 64b align with mid + +.inst 0xce1d5652 //eor3 v18.16b, v18.16b, v29.16b, v21.16b //MODULO - fold into mid + + pmull v17.1q, v18.1d, v16.1d //MODULO - mid 64b align with low + ext v21.16b, v18.16b, v18.16b, #8 //MODULO - other mid alignment + +.inst 0xce115673 //eor3 v19.16b, v19.16b, v17.16b, v21.16b //MODULO - fold into low + ext v19.16b, v19.16b, v19.16b, #8 + rev64 v19.16b, v19.16b + st1 { v19.16b }, [x3] + + mov x0, x9 //return sizes + + ldp d10, d11, [sp, #16] + ldp d12, d13, [sp, #32] + ldp d14, d15, [sp, #48] + ldp d8, d9, [sp], #80 + ret + +.L192_enc_ret: + mov w0, #0x0 + ret +.size unroll8_eor3_aes_gcm_enc_192_kernel,.-unroll8_eor3_aes_gcm_enc_192_kernel +.globl unroll8_eor3_aes_gcm_dec_192_kernel +.type unroll8_eor3_aes_gcm_dec_192_kernel,%function +.align 4 +unroll8_eor3_aes_gcm_dec_192_kernel: + AARCH64_VALID_CALL_TARGET + cbz x1, .L192_dec_ret + stp d8, d9, [sp, #-80]! 
+ lsr x9, x1, #3 + mov x16, x4 + mov x8, x5 + stp d10, d11, [sp, #16] + stp d12, d13, [sp, #32] + stp d14, d15, [sp, #48] + mov x5, #0xc200000000000000 + stp x5, xzr, [sp, #64] + add x10, sp, #64 + + mov x5, x9 + ld1 { v0.16b}, [x16] //CTR block 0 + ld1 { v19.16b}, [x3] + + mov x15, #0x100000000 //set up counter increment + movi v31.16b, #0x0 + mov v31.d[1], x15 + + rev32 v30.16b, v0.16b //set up reversed counter + + add v30.4s, v30.4s, v31.4s //CTR block 0 + + rev32 v1.16b, v30.16b //CTR block 1 + add v30.4s, v30.4s, v31.4s //CTR block 1 + + rev32 v2.16b, v30.16b //CTR block 2 + add v30.4s, v30.4s, v31.4s //CTR block 2 + + rev32 v3.16b, v30.16b //CTR block 3 + add v30.4s, v30.4s, v31.4s //CTR block 3 + + rev32 v4.16b, v30.16b //CTR block 4 + add v30.4s, v30.4s, v31.4s //CTR block 4 + + rev32 v5.16b, v30.16b //CTR block 5 + add v30.4s, v30.4s, v31.4s //CTR block 5 + ldp q26, q27, [x8, #0] //load rk0, rk1 + + rev32 v6.16b, v30.16b //CTR block 6 + add v30.4s, v30.4s, v31.4s //CTR block 6 + + rev32 v7.16b, v30.16b //CTR block 7 + + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 3 - round 0 + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 6 - round 0 + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 5 - round 0 + + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 0 - round 0 + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 1 - round 0 + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 7 - round 0 + + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 2 - round 0 + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 4 - round 0 + ldp q28, q26, [x8, #32] //load rk2, rk3 + + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 1 - round 1 + + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 2 - round 1 + + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 0 - round 1 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 3 - round 1 + aese v7.16b, v27.16b + aesmc v7.16b, v7.16b //AES block 7 - round 1 + + aese v5.16b, v27.16b + aesmc v5.16b, v5.16b //AES block 5 - round 1 + aese v6.16b, v27.16b + aesmc v6.16b, v6.16b //AES block 6 - round 1 + + aese v7.16b, v28.16b + aesmc v7.16b, v7.16b //AES block 7 - round 2 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 0 - round 2 + aese v4.16b, v27.16b + aesmc v4.16b, v4.16b //AES block 4 - round 1 + + aese v5.16b, v28.16b + aesmc v5.16b, v5.16b //AES block 5 - round 2 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 1 - round 2 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 2 - round 2 + + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 3 - round 2 + aese v4.16b, v28.16b + aesmc v4.16b, v4.16b //AES block 4 - round 2 + aese v6.16b, v28.16b + aesmc v6.16b, v6.16b //AES block 6 - round 2 + + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 7 - round 3 + + ldp q27, q28, [x8, #64] //load rk4, rk5 + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 2 - round 3 + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 5 - round 3 + + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 0 - round 3 + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 3 - round 3 + + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 4 - round 3 + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 1 - round 3 + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 6 - round 3 + + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 3 - round 4 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 2 - round 4 + aese v5.16b, 
v27.16b + aesmc v5.16b, v5.16b //AES block 5 - round 4 + + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 1 - round 4 + aese v7.16b, v27.16b + aesmc v7.16b, v7.16b //AES block 7 - round 4 + aese v6.16b, v27.16b + aesmc v6.16b, v6.16b //AES block 6 - round 4 + + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 0 - round 4 + aese v5.16b, v28.16b + aesmc v5.16b, v5.16b //AES block 5 - round 5 + aese v4.16b, v27.16b + aesmc v4.16b, v4.16b //AES block 4 - round 4 + + aese v6.16b, v28.16b + aesmc v6.16b, v6.16b //AES block 6 - round 5 + ldp q26, q27, [x8, #96] //load rk6, rk7 + + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 0 - round 5 + aese v4.16b, v28.16b + aesmc v4.16b, v4.16b //AES block 4 - round 5 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 1 - round 5 + + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 3 - round 5 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 2 - round 5 + aese v7.16b, v28.16b + aesmc v7.16b, v7.16b //AES block 7 - round 5 + + sub x5, x5, #1 //byte_len - 1 + + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 4 - round 6 + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 5 - round 6 + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 1 - round 6 + + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 0 - round 6 + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 3 - round 6 + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 6 - round 6 + + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 7 - round 6 + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 2 - round 6 + ldp q28, q26, [x8, #128] //load rk8, rk9 + + add v30.4s, v30.4s, v31.4s //CTR block 7 + + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 3 - round 7 + aese v7.16b, v27.16b + aesmc v7.16b, v7.16b //AES block 7 - round 7 + + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 2 - round 7 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 1 - round 7 + aese v4.16b, v27.16b + aesmc v4.16b, v4.16b //AES block 4 - round 7 + + aese v6.16b, v27.16b + aesmc v6.16b, v6.16b //AES block 6 - round 7 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 0 - round 7 + aese v5.16b, v27.16b + aesmc v5.16b, v5.16b //AES block 5 - round 7 + + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 1 - round 8 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 2 - round 8 + and x5, x5, #0xffffffffffffff80 //number of bytes to be processed in main loop (at least 1 byte must be handled by tail) + + aese v7.16b, v28.16b + aesmc v7.16b, v7.16b //AES block 7 - round 8 + aese v6.16b, v28.16b + aesmc v6.16b, v6.16b //AES block 6 - round 8 + aese v5.16b, v28.16b + aesmc v5.16b, v5.16b //AES block 5 - round 8 + + aese v4.16b, v28.16b + aesmc v4.16b, v4.16b //AES block 4 - round 8 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 3 - round 8 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 0 - round 8 + + add x4, x0, x1, lsr #3 //end_input_ptr + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 6 - round 9 + + ld1 { v19.16b}, [x3] + ext v19.16b, v19.16b, v19.16b, #8 + rev64 v19.16b, v19.16b + + ldp q27, q28, [x8, #160] //load rk10, rk11 + + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 0 - round 9 + add x5, x5, x0 + + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 1 - round 9 + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 7 - round 9 + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 4 - round 9 + + cmp x0, x5 //check if we have <= 8 blocks + 
aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 3 - round 9 + + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 5 - round 9 + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 2 - round 9 + + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 3 - round 10 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 1 - round 10 + aese v7.16b, v27.16b + aesmc v7.16b, v7.16b //AES block 7 - round 10 + + aese v4.16b, v27.16b + aesmc v4.16b, v4.16b //AES block 4 - round 10 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 0 - round 10 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 2 - round 10 + + aese v6.16b, v27.16b + aesmc v6.16b, v6.16b //AES block 6 - round 10 + aese v5.16b, v27.16b + aesmc v5.16b, v5.16b //AES block 5 - round 10 + ldr q26, [x8, #192] //load rk12 + + aese v0.16b, v28.16b //AES block 0 - round 11 + aese v1.16b, v28.16b //AES block 1 - round 11 + aese v4.16b, v28.16b //AES block 4 - round 11 + + aese v6.16b, v28.16b //AES block 6 - round 11 + aese v5.16b, v28.16b //AES block 5 - round 11 + aese v7.16b, v28.16b //AES block 7 - round 11 + + aese v2.16b, v28.16b //AES block 2 - round 11 + aese v3.16b, v28.16b //AES block 3 - round 11 + b.ge .L192_dec_tail //handle tail + + ldp q8, q9, [x0], #32 //AES block 0, 1 - load ciphertext + + ldp q10, q11, [x0], #32 //AES block 2, 3 - load ciphertext + + ldp q12, q13, [x0], #32 //AES block 4, 5 - load ciphertext + +.inst 0xce016921 //eor3 v1.16b, v9.16b, v1.16b, v26.16b //AES block 1 - result +.inst 0xce006900 //eor3 v0.16b, v8.16b, v0.16b, v26.16b //AES block 0 - result + stp q0, q1, [x2], #32 //AES block 0, 1 - store result + + rev32 v0.16b, v30.16b //CTR block 8 + add v30.4s, v30.4s, v31.4s //CTR block 8 + + rev32 v1.16b, v30.16b //CTR block 9 + add v30.4s, v30.4s, v31.4s //CTR block 9 +.inst 0xce036963 //eor3 v3.16b, v11.16b, v3.16b, v26.16b //AES block 3 - result + +.inst 0xce026942 //eor3 v2.16b, v10.16b, v2.16b, v26.16b //AES block 2 - result + stp q2, q3, [x2], #32 //AES block 2, 3 - store result + ldp q14, q15, [x0], #32 //AES block 6, 7 - load ciphertext + + rev32 v2.16b, v30.16b //CTR block 10 + add v30.4s, v30.4s, v31.4s //CTR block 10 + +.inst 0xce046984 //eor3 v4.16b, v12.16b, v4.16b, v26.16b //AES block 4 - result + + rev32 v3.16b, v30.16b //CTR block 11 + add v30.4s, v30.4s, v31.4s //CTR block 11 + +.inst 0xce0569a5 //eor3 v5.16b, v13.16b, v5.16b, v26.16b //AES block 5 - result + stp q4, q5, [x2], #32 //AES block 4, 5 - store result + cmp x0, x5 //check if we have <= 8 blocks + +.inst 0xce0669c6 //eor3 v6.16b, v14.16b, v6.16b, v26.16b //AES block 6 - result +.inst 0xce0769e7 //eor3 v7.16b, v15.16b, v7.16b, v26.16b //AES block 7 - result + rev32 v4.16b, v30.16b //CTR block 12 + + add v30.4s, v30.4s, v31.4s //CTR block 12 + stp q6, q7, [x2], #32 //AES block 6, 7 - store result + b.ge .L192_dec_prepretail //do prepretail + +.L192_dec_main_loop: //main loop start + rev64 v9.16b, v9.16b //GHASH block 8k+1 + ldp q26, q27, [x8, #0] //load rk0, rk1 + ext v19.16b, v19.16b, v19.16b, #8 //PRE 0 + + rev64 v8.16b, v8.16b //GHASH block 8k + rev32 v5.16b, v30.16b //CTR block 8k+13 + add v30.4s, v30.4s, v31.4s //CTR block 8k+13 + + ldr q23, [x3, #176] //load h7l | h7h + ext v23.16b, v23.16b, v23.16b, #8 + ldr q25, [x3, #208] //load h8l | h8h + ext v25.16b, v25.16b, v25.16b, #8 + rev64 v12.16b, v12.16b //GHASH block 8k+4 + rev64 v11.16b, v11.16b //GHASH block 8k+3 + + eor v8.16b, v8.16b, v19.16b //PRE 1 + rev32 v6.16b, v30.16b //CTR block 8k+14 + add v30.4s, v30.4s, v31.4s //CTR block 8k+14 + + 
rev64 v13.16b, v13.16b //GHASH block 8k+5 + + rev32 v7.16b, v30.16b //CTR block 8k+15 + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 0 + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 0 + + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 0 + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 0 + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 0 + + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 0 + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 0 + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 0 + + pmull v19.1q, v8.1d, v25.1d //GHASH block 8k - low + pmull2 v16.1q, v9.2d, v23.2d //GHASH block 8k+1 - high + ldp q28, q26, [x8, #32] //load rk2, rk3 + + aese v6.16b, v27.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 1 + pmull v23.1q, v9.1d, v23.1d //GHASH block 8k+1 - low + ldr q20, [x3, #128] //load h5l | h5h + ext v20.16b, v20.16b, v20.16b, #8 + ldr q22, [x3, #160] //load h6l | h6h + ext v22.16b, v22.16b, v22.16b, #8 + + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 1 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 1 + aese v7.16b, v27.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 1 + + pmull2 v17.1q, v8.2d, v25.2d //GHASH block 8k - high + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 1 + aese v4.16b, v27.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 1 + + trn1 v18.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid + rev64 v10.16b, v10.16b //GHASH block 8k+2 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 1 + + aese v5.16b, v27.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 1 + ldr q21, [x3, #144] //load h6k | h5k + ldr q24, [x3, #192] //load h8k | h7k + trn2 v8.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid + + eor v17.16b, v17.16b, v16.16b //GHASH block 8k+1 - high + pmull2 v9.1q, v11.2d, v20.2d //GHASH block 8k+3 - high + pmull2 v29.1q, v10.2d, v22.2d //GHASH block 8k+2 - high + + eor v8.16b, v8.16b, v18.16b //GHASH block 8k, 8k+1 - mid + eor v19.16b, v19.16b, v23.16b //GHASH block 8k+1 - low + aese v6.16b, v28.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 2 + + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 2 + pmull v20.1q, v11.1d, v20.1d //GHASH block 8k+3 - low +.inst 0xce1d2631 //eor3 v17.16b, v17.16b, v29.16b, v9.16b //GHASH block 8k+2, 8k+3 - high + + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 2 + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 3 + aese v4.16b, v28.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 2 + + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 2 + aese v7.16b, v28.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 2 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 2 + + ldr q23, [x3, #80] //load h3l | h3h + ext v23.16b, v23.16b, v23.16b, #8 + ldr q25, [x3, #112] //load h4l | h4h + ext v25.16b, v25.16b, v25.16b, #8 + aese v5.16b, v28.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 2 + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 3 + + pmull v22.1q, v10.1d, v22.1d //GHASH block 8k+2 - low + trn1 v29.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid + trn2 v10.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid + + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 3 + aese v4.16b, v26.16b + aesmc 
v4.16b, v4.16b //AES block 8k+12 - round 3 + + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 3 + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 3 + ldp q27, q28, [x8, #64] //load rk4, rk5 + + eor v10.16b, v10.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid +.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+2, 8k+3 - low + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 3 + + trn1 v16.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid + add v30.4s, v30.4s, v31.4s //CTR block 8k+15 + + pmull2 v29.1q, v10.2d, v21.2d //GHASH block 8k+2 - mid + pmull2 v18.1q, v8.2d, v24.2d //GHASH block 8k - mid + pmull v24.1q, v8.1d, v24.1d //GHASH block 8k+1 - mid + + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 3 + pmull v21.1q, v10.1d, v21.1d //GHASH block 8k+3 - mid + pmull2 v8.1q, v12.2d, v25.2d //GHASH block 8k+4 - high + + aese v4.16b, v27.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 4 + aese v6.16b, v27.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 4 + eor v18.16b, v18.16b, v24.16b //GHASH block 8k+1 - mid + + aese v5.16b, v27.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 4 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 4 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 4 + + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 4 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 4 + aese v7.16b, v27.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 4 + + ldr q20, [x3, #32] //load h1l | h1h + ext v20.16b, v20.16b, v20.16b, #8 + ldr q22, [x3, #64] //load h2l | h2h + ext v22.16b, v22.16b, v22.16b, #8 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 5 + aese v5.16b, v28.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 5 + + ldp q26, q27, [x8, #96] //load rk6, rk7 + aese v7.16b, v28.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 5 + rev64 v15.16b, v15.16b //GHASH block 8k+7 + + aese v4.16b, v28.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 5 +.inst 0xce157652 //eor3 v18.16b, v18.16b, v21.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 5 + + pmull v25.1q, v12.1d, v25.1d //GHASH block 8k+4 - low + trn2 v12.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 5 + + aese v6.16b, v28.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 5 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 5 + rev64 v14.16b, v14.16b //GHASH block 8k+6 + + ldr q21, [x3, #48] //load h2k | h1k + ldr q24, [x3, #96] //load h4k | h3k + pmull2 v10.1q, v13.2d, v23.2d //GHASH block 8k+5 - high + pmull v23.1q, v13.1d, v23.1d //GHASH block 8k+5 - low + + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 6 + eor v12.16b, v12.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid + trn1 v13.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid + + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 6 + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 6 + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 6 + + pmull2 v11.1q, v14.2d, v22.2d //GHASH block 8k+6 - high + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 6 + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 6 + + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 
8k+10 - round 7 + aese v6.16b, v27.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 7 + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 6 + + pmull2 v16.1q, v12.2d, v24.2d //GHASH block 8k+4 - mid +.inst 0xce082a31 //eor3 v17.16b, v17.16b, v8.16b, v10.16b //GHASH block 8k+4, 8k+5 - high +.inst 0xce195e73 //eor3 v19.16b, v19.16b, v25.16b, v23.16b //GHASH block 8k+4, 8k+5 - low + + pmull v22.1q, v14.1d, v22.1d //GHASH block 8k+6 - low + trn2 v14.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 6 + + aese v5.16b, v27.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 7 + ldp q28, q26, [x8, #128] //load rk8, rk9 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 7 + + eor v14.16b, v14.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid + pmull v24.1q, v12.1d, v24.1d //GHASH block 8k+5 - mid + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 7 + + aese v4.16b, v27.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 7 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 7 + aese v7.16b, v27.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 7 + +.inst 0xce184252 //eor3 v18.16b, v18.16b, v24.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid + pmull2 v13.1q, v14.2d, v21.2d //GHASH block 8k+6 - mid + pmull2 v12.1q, v15.2d, v20.2d //GHASH block 8k+7 - high + + pmull v21.1q, v14.1d, v21.1d //GHASH block 8k+7 - mid + ldr d16, [x10] //MODULO - load modulo constant + pmull v20.1q, v15.1d, v20.1d //GHASH block 8k+7 - low + + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 8 + aese v5.16b, v28.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 8 + aese v7.16b, v28.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 8 + + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 8 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 8 +.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+6, 8k+7 - low + + aese v4.16b, v28.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 8 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 8 + aese v6.16b, v28.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 8 + +.inst 0xce0b3231 //eor3 v17.16b, v17.16b, v11.16b, v12.16b //GHASH block 8k+6, 8k+7 - high + rev32 v20.16b, v30.16b //CTR block 8k+16 + add v30.4s, v30.4s, v31.4s //CTR block 8k+16 + + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 9 +.inst 0xce153652 //eor3 v18.16b, v18.16b, v21.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 9 + + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 9 + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 9 + ldp q27, q28, [x8, #160] //load rk10, rk11 + +.inst 0xce114e52 //eor3 v18.16b, v18.16b, v17.16b, v19.16b //MODULO - karatsuba tidy up + ldp q8, q9, [x0], #32 //AES block 8k+8, 8k+9 - load ciphertext + + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 9 + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 9 + ldp q10, q11, [x0], #32 //AES block 8k+10, 8k+11 - load ciphertext + + rev32 v22.16b, v30.16b //CTR block 8k+17 + pmull v29.1q, v17.1d, v16.1d //MODULO - top 64b align with mid + add v30.4s, v30.4s, v31.4s //CTR block 8k+17 + + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 9 + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 
8k+12 - round 9 + ext v21.16b, v17.16b, v17.16b, #8 //MODULO - other top alignment + + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 10 + aese v7.16b, v27.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 10 + ldp q12, q13, [x0], #32 //AES block 8k+12, 8k+13 - load ciphertext + + rev32 v23.16b, v30.16b //CTR block 8k+18 + add v30.4s, v30.4s, v31.4s //CTR block 8k+18 +.inst 0xce1d5652 //eor3 v18.16b, v18.16b, v29.16b, v21.16b //MODULO - fold into mid + + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 10 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 10 + ldr q26, [x8, #192] //load rk12 + + ldp q14, q15, [x0], #32 //AES block 8k+14, 8k+15 - load ciphertext + aese v4.16b, v27.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 10 + aese v6.16b, v27.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 10 + + aese v0.16b, v28.16b //AES block 8k+8 - round 11 + ext v21.16b, v18.16b, v18.16b, #8 //MODULO - other mid alignment + aese v1.16b, v28.16b //AES block 8k+9 - round 11 + + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 10 + aese v6.16b, v28.16b //AES block 8k+14 - round 11 + aese v3.16b, v28.16b //AES block 8k+11 - round 11 + +.inst 0xce006900 //eor3 v0.16b, v8.16b, v0.16b, v26.16b //AES block 8k+8 - result + rev32 v25.16b, v30.16b //CTR block 8k+19 + aese v5.16b, v27.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 10 + + aese v4.16b, v28.16b //AES block 8k+12 - round 11 + aese v2.16b, v28.16b //AES block 8k+10 - round 11 + add v30.4s, v30.4s, v31.4s //CTR block 8k+19 + + aese v7.16b, v28.16b //AES block 8k+15 - round 11 + aese v5.16b, v28.16b //AES block 8k+13 - round 11 + pmull v17.1q, v18.1d, v16.1d //MODULO - mid 64b align with low + +.inst 0xce016921 //eor3 v1.16b, v9.16b, v1.16b, v26.16b //AES block 8k+9 - result + stp q0, q1, [x2], #32 //AES block 8k+8, 8k+9 - store result +.inst 0xce036963 //eor3 v3.16b, v11.16b, v3.16b, v26.16b //AES block 8k+11 - result + +.inst 0xce026942 //eor3 v2.16b, v10.16b, v2.16b, v26.16b //AES block 8k+10 - result +.inst 0xce0769e7 //eor3 v7.16b, v15.16b, v7.16b, v26.16b //AES block 8k+15 - result + stp q2, q3, [x2], #32 //AES block 8k+10, 8k+11 - store result + +.inst 0xce0569a5 //eor3 v5.16b, v13.16b, v5.16b, v26.16b //AES block 8k+13 - result +.inst 0xce115673 //eor3 v19.16b, v19.16b, v17.16b, v21.16b //MODULO - fold into low + mov v3.16b, v25.16b //CTR block 8k+19 + +.inst 0xce046984 //eor3 v4.16b, v12.16b, v4.16b, v26.16b //AES block 8k+12 - result + stp q4, q5, [x2], #32 //AES block 8k+12, 8k+13 - store result + cmp x0, x5 //.LOOP CONTROL + +.inst 0xce0669c6 //eor3 v6.16b, v14.16b, v6.16b, v26.16b //AES block 8k+14 - result + stp q6, q7, [x2], #32 //AES block 8k+14, 8k+15 - store result + mov v0.16b, v20.16b //CTR block 8k+16 + + mov v1.16b, v22.16b //CTR block 8k+17 + mov v2.16b, v23.16b //CTR block 8k+18 + + rev32 v4.16b, v30.16b //CTR block 8k+20 + add v30.4s, v30.4s, v31.4s //CTR block 8k+20 + b.lt .L192_dec_main_loop + +.L192_dec_prepretail: //PREPRETAIL + ldp q26, q27, [x8, #0] //load rk0, rk1 + rev32 v5.16b, v30.16b //CTR block 8k+13 + add v30.4s, v30.4s, v31.4s //CTR block 8k+13 + + ldr q23, [x3, #176] //load h7l | h7h + ext v23.16b, v23.16b, v23.16b, #8 + ldr q25, [x3, #208] //load h8l | h8h + ext v25.16b, v25.16b, v25.16b, #8 + rev64 v8.16b, v8.16b //GHASH block 8k + ext v19.16b, v19.16b, v19.16b, #8 //PRE 0 + + rev64 v11.16b, v11.16b //GHASH block 8k+3 + rev32 v6.16b, v30.16b //CTR block 8k+14 + add v30.4s, v30.4s, v31.4s //CTR block 8k+14 
+ + eor v8.16b, v8.16b, v19.16b //PRE 1 + rev64 v10.16b, v10.16b //GHASH block 8k+2 + rev64 v9.16b, v9.16b //GHASH block 8k+1 + + ldr q20, [x3, #128] //load h5l | h5h + ext v20.16b, v20.16b, v20.16b, #8 + ldr q22, [x3, #160] //load h6l | h6h + ext v22.16b, v22.16b, v22.16b, #8 + rev32 v7.16b, v30.16b //CTR block 8k+15 + + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 0 + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 0 + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 0 + + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 0 + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 0 + pmull2 v16.1q, v9.2d, v23.2d //GHASH block 8k+1 - high + + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 0 + pmull2 v17.1q, v8.2d, v25.2d //GHASH block 8k - high + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 0 + + aese v6.16b, v27.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 1 + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 0 + ldp q28, q26, [x8, #32] //load rk2, rk3 + + aese v4.16b, v27.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 1 + pmull2 v29.1q, v10.2d, v22.2d //GHASH block 8k+2 - high + pmull v22.1q, v10.1d, v22.1d //GHASH block 8k+2 - low + + pmull v23.1q, v9.1d, v23.1d //GHASH block 8k+1 - low + eor v17.16b, v17.16b, v16.16b //GHASH block 8k+1 - high + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 1 + + pmull v19.1q, v8.1d, v25.1d //GHASH block 8k - low + aese v7.16b, v27.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 1 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 1 + + trn1 v18.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid + trn2 v8.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid + pmull2 v9.1q, v11.2d, v20.2d //GHASH block 8k+3 - high + + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 1 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 1 + aese v5.16b, v27.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 1 + + ldr q21, [x3, #144] //load h6k | h5k + ldr q24, [x3, #192] //load h8k | h7k + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 2 + eor v8.16b, v8.16b, v18.16b //GHASH block 8k, 8k+1 - mid + + aese v6.16b, v28.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 2 + rev64 v13.16b, v13.16b //GHASH block 8k+5 + pmull v20.1q, v11.1d, v20.1d //GHASH block 8k+3 - low + +.inst 0xce1d2631 //eor3 v17.16b, v17.16b, v29.16b, v9.16b //GHASH block 8k+2, 8k+3 - high + aese v4.16b, v28.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 2 + aese v5.16b, v28.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 2 + + trn1 v29.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 3 + aese v7.16b, v28.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 2 + + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 2 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 2 + trn2 v10.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid + + pmull2 v18.1q, v8.2d, v24.2d //GHASH block 8k - mid + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 2 + pmull v24.1q, v8.1d, v24.1d //GHASH block 8k+1 - mid + + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 3 + eor v10.16b, v10.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid + eor v19.16b, v19.16b, v23.16b //GHASH block 8k+1 - low + 
+ aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 3 + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 3 + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 3 + +.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+2, 8k+3 - low + ldp q27, q28, [x8, #64] //load rk4, rk5 + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 3 + + ldr q23, [x3, #80] //load h3l | h3h + ext v23.16b, v23.16b, v23.16b, #8 + ldr q25, [x3, #112] //load h4l | h4h + ext v25.16b, v25.16b, v25.16b, #8 + pmull2 v29.1q, v10.2d, v21.2d //GHASH block 8k+2 - mid + pmull v21.1q, v10.1d, v21.1d //GHASH block 8k+3 - mid + + ldr q20, [x3, #32] //load h1l | h1h + ext v20.16b, v20.16b, v20.16b, #8 + ldr q22, [x3, #64] //load h2l | h2h + ext v22.16b, v22.16b, v22.16b, #8 + eor v18.16b, v18.16b, v24.16b //GHASH block 8k+1 - mid + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 3 + + rev64 v15.16b, v15.16b //GHASH block 8k+7 + +.inst 0xce157652 //eor3 v18.16b, v18.16b, v21.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid + rev64 v12.16b, v12.16b //GHASH block 8k+4 + + aese v5.16b, v27.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 4 + aese v4.16b, v27.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 4 + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 3 + + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 4 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 4 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 4 + + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 4 + aese v6.16b, v27.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 4 + aese v7.16b, v27.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 4 + + rev64 v14.16b, v14.16b //GHASH block 8k+6 + ldr q21, [x3, #48] //load h2k | h1k + ldr q24, [x3, #96] //load h4k | h3k + trn1 v16.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid + + aese v7.16b, v28.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 5 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 5 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 5 + + ldp q26, q27, [x8, #96] //load rk6, rk7 + aese v6.16b, v28.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 5 + aese v5.16b, v28.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 5 + + pmull2 v11.1q, v14.2d, v22.2d //GHASH block 8k+6 - high + pmull2 v8.1q, v12.2d, v25.2d //GHASH block 8k+4 - high + pmull v22.1q, v14.1d, v22.1d //GHASH block 8k+6 - low + + aese v4.16b, v28.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 5 + + pmull v25.1q, v12.1d, v25.1d //GHASH block 8k+4 - low + trn2 v12.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid + pmull2 v10.1q, v13.2d, v23.2d //GHASH block 8k+5 - high + + pmull v23.1q, v13.1d, v23.1d //GHASH block 8k+5 - low + trn1 v13.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 5 + + trn2 v14.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 5 + eor v12.16b, v12.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid + + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 6 + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 6 + + eor v14.16b, v14.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 6 + aese v7.16b, v26.16b 
+ aesmc v7.16b, v7.16b //AES block 8k+15 - round 6 + + pmull2 v16.1q, v12.2d, v24.2d //GHASH block 8k+4 - mid + pmull v24.1q, v12.1d, v24.1d //GHASH block 8k+5 - mid + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 6 + + pmull2 v13.1q, v14.2d, v21.2d //GHASH block 8k+6 - mid + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 6 + pmull2 v12.1q, v15.2d, v20.2d //GHASH block 8k+7 - high + +.inst 0xce184252 //eor3 v18.16b, v18.16b, v24.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid + aese v4.16b, v27.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 7 +.inst 0xce195e73 //eor3 v19.16b, v19.16b, v25.16b, v23.16b //GHASH block 8k+4, 8k+5 - low + + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 6 + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 6 + aese v5.16b, v27.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 7 + + ldp q28, q26, [x8, #128] //load rk8, rk9 + pmull v21.1q, v14.1d, v21.1d //GHASH block 8k+7 - mid + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 7 + + ldr d16, [x10] //MODULO - load modulo constant +.inst 0xce082a31 //eor3 v17.16b, v17.16b, v8.16b, v10.16b //GHASH block 8k+4, 8k+5 - high + pmull v20.1q, v15.1d, v20.1d //GHASH block 8k+7 - low + + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 7 + aese v7.16b, v27.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 7 + aese v6.16b, v27.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 7 + +.inst 0xce0b3231 //eor3 v17.16b, v17.16b, v11.16b, v12.16b //GHASH block 8k+6, 8k+7 - high +.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+6, 8k+7 - low +.inst 0xce153652 //eor3 v18.16b, v18.16b, v21.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid + + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 7 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 7 + +.inst 0xce114e52 //eor3 v18.16b, v18.16b, v17.16b, v19.16b //MODULO - karatsuba tidy up + ext v21.16b, v17.16b, v17.16b, #8 //MODULO - other top alignment + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 8 + + aese v6.16b, v28.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 8 + aese v7.16b, v28.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 8 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 8 + + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 8 + pmull v29.1q, v17.1d, v16.1d //MODULO - top 64b align with mid + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 8 + + aese v5.16b, v28.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 8 + aese v4.16b, v28.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 8 + ldp q27, q28, [x8, #160] //load rk10, rk11 + +.inst 0xce1d5652 //eor3 v18.16b, v18.16b, v29.16b, v21.16b //MODULO - fold into mid + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 9 + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 9 + + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 9 + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 9 + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 9 + + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 9 + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 9 + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 9 + + pmull v17.1q, v18.1d, v16.1d //MODULO - mid 64b align with low + ldr 
q26, [x8, #192] //load rk12 + ext v21.16b, v18.16b, v18.16b, #8 //MODULO - other mid alignment + + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 10 + aese v5.16b, v27.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 10 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 10 + + aese v4.16b, v27.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 10 + aese v6.16b, v27.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 10 + aese v7.16b, v27.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 10 + + aese v0.16b, v28.16b //AES block 8k+8 - round 11 +.inst 0xce115673 //eor3 v19.16b, v19.16b, v17.16b, v21.16b //MODULO - fold into low + aese v5.16b, v28.16b //AES block 8k+13 - round 11 + + aese v2.16b, v28.16b //AES block 8k+10 - round 11 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 10 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 10 + + aese v6.16b, v28.16b //AES block 8k+14 - round 11 + aese v4.16b, v28.16b //AES block 8k+12 - round 11 + add v30.4s, v30.4s, v31.4s //CTR block 8k+15 + + aese v3.16b, v28.16b //AES block 8k+11 - round 11 + aese v1.16b, v28.16b //AES block 8k+9 - round 11 + aese v7.16b, v28.16b //AES block 8k+15 - round 11 + +.L192_dec_tail: //TAIL + + sub x5, x4, x0 //main_end_input_ptr is number of bytes left to process + + ldp q20, q21, [x3, #128] //load h5l | h5h + ext v20.16b, v20.16b, v20.16b, #8 + ldr q9, [x0], #16 //AES block 8k+8 - load ciphertext + + ldp q24, q25, [x3, #192] //load h8k | h7k + ext v25.16b, v25.16b, v25.16b, #8 + + mov v29.16b, v26.16b + + ldp q22, q23, [x3, #160] //load h6l | h6h + ext v22.16b, v22.16b, v22.16b, #8 + ext v23.16b, v23.16b, v23.16b, #8 + ext v16.16b, v19.16b, v19.16b, #8 //prepare final partial tag + +.inst 0xce00752c //eor3 v12.16b, v9.16b, v0.16b, v29.16b //AES block 8k+8 - result + cmp x5, #112 + b.gt .L192_dec_blocks_more_than_7 + + mov v7.16b, v6.16b + movi v17.8b, #0 + sub v30.4s, v30.4s, v31.4s + + mov v6.16b, v5.16b + mov v5.16b, v4.16b + mov v4.16b, v3.16b + + cmp x5, #96 + movi v19.8b, #0 + mov v3.16b, v2.16b + + mov v2.16b, v1.16b + movi v18.8b, #0 + b.gt .L192_dec_blocks_more_than_6 + + mov v7.16b, v6.16b + mov v6.16b, v5.16b + mov v5.16b, v4.16b + + mov v4.16b, v3.16b + mov v3.16b, v1.16b + + sub v30.4s, v30.4s, v31.4s + cmp x5, #80 + b.gt .L192_dec_blocks_more_than_5 + + mov v7.16b, v6.16b + mov v6.16b, v5.16b + + mov v5.16b, v4.16b + mov v4.16b, v1.16b + cmp x5, #64 + + sub v30.4s, v30.4s, v31.4s + b.gt .L192_dec_blocks_more_than_4 + + sub v30.4s, v30.4s, v31.4s + mov v7.16b, v6.16b + mov v6.16b, v5.16b + + mov v5.16b, v1.16b + cmp x5, #48 + b.gt .L192_dec_blocks_more_than_3 + + sub v30.4s, v30.4s, v31.4s + mov v7.16b, v6.16b + cmp x5, #32 + + mov v6.16b, v1.16b + ldr q24, [x3, #96] //load h4k | h3k + b.gt .L192_dec_blocks_more_than_2 + + sub v30.4s, v30.4s, v31.4s + + mov v7.16b, v1.16b + cmp x5, #16 + b.gt .L192_dec_blocks_more_than_1 + + sub v30.4s, v30.4s, v31.4s + ldr q21, [x3, #48] //load h2k | h1k + b .L192_dec_blocks_less_than_1 +.L192_dec_blocks_more_than_7: //blocks left > 7 + rev64 v8.16b, v9.16b //GHASH final-7 block + + ins v18.d[0], v24.d[1] //GHASH final-7 block - mid + eor v8.16b, v8.16b, v16.16b //feed in partial tag + + pmull2 v17.1q, v8.2d, v25.2d //GHASH final-7 block - high + ins v27.d[0], v8.d[1] //GHASH final-7 block - mid + ldr q9, [x0], #16 //AES final-6 block - load ciphertext + + pmull v19.1q, v8.1d, v25.1d //GHASH final-7 block - low + + eor v27.8b, v27.8b, v8.8b //GHASH final-7 block - mid + st1 { 
v12.16b}, [x2], #16 //AES final-7 block - store result + +.inst 0xce01752c //eor3 v12.16b, v9.16b, v1.16b, v29.16b //AES final-6 block - result + + pmull v18.1q, v27.1d, v18.1d //GHASH final-7 block - mid + movi v16.8b, #0 //suppress further partial tag feed in +.L192_dec_blocks_more_than_6: //blocks left > 6 + + rev64 v8.16b, v9.16b //GHASH final-6 block + + eor v8.16b, v8.16b, v16.16b //feed in partial tag + + ldr q9, [x0], #16 //AES final-5 block - load ciphertext + ins v27.d[0], v8.d[1] //GHASH final-6 block - mid + + eor v27.8b, v27.8b, v8.8b //GHASH final-6 block - mid + movi v16.8b, #0 //suppress further partial tag feed in + pmull2 v28.1q, v8.2d, v23.2d //GHASH final-6 block - high + + st1 { v12.16b}, [x2], #16 //AES final-6 block - store result +.inst 0xce02752c //eor3 v12.16b, v9.16b, v2.16b, v29.16b //AES final-5 block - result + + eor v17.16b, v17.16b, v28.16b //GHASH final-6 block - high + pmull v27.1q, v27.1d, v24.1d //GHASH final-6 block - mid + pmull v26.1q, v8.1d, v23.1d //GHASH final-6 block - low + + eor v18.16b, v18.16b, v27.16b //GHASH final-6 block - mid + eor v19.16b, v19.16b, v26.16b //GHASH final-6 block - low +.L192_dec_blocks_more_than_5: //blocks left > 5 + + rev64 v8.16b, v9.16b //GHASH final-5 block + + eor v8.16b, v8.16b, v16.16b //feed in partial tag + + ins v27.d[0], v8.d[1] //GHASH final-5 block - mid + + eor v27.8b, v27.8b, v8.8b //GHASH final-5 block - mid + + ins v27.d[1], v27.d[0] //GHASH final-5 block - mid + pmull2 v28.1q, v8.2d, v22.2d //GHASH final-5 block - high + + ldr q9, [x0], #16 //AES final-4 block - load ciphertext + + eor v17.16b, v17.16b, v28.16b //GHASH final-5 block - high + pmull v26.1q, v8.1d, v22.1d //GHASH final-5 block - low + + pmull2 v27.1q, v27.2d, v21.2d //GHASH final-5 block - mid + + eor v19.16b, v19.16b, v26.16b //GHASH final-5 block - low + movi v16.8b, #0 //suppress further partial tag feed in + st1 { v12.16b}, [x2], #16 //AES final-5 block - store result + + eor v18.16b, v18.16b, v27.16b //GHASH final-5 block - mid +.inst 0xce03752c //eor3 v12.16b, v9.16b, v3.16b, v29.16b //AES final-4 block - result +.L192_dec_blocks_more_than_4: //blocks left > 4 + + rev64 v8.16b, v9.16b //GHASH final-4 block + + eor v8.16b, v8.16b, v16.16b //feed in partial tag + movi v16.8b, #0 //suppress further partial tag feed in + + ldr q9, [x0], #16 //AES final-3 block - load ciphertext + ins v27.d[0], v8.d[1] //GHASH final-4 block - mid + pmull v26.1q, v8.1d, v20.1d //GHASH final-4 block - low + + eor v27.8b, v27.8b, v8.8b //GHASH final-4 block - mid + + eor v19.16b, v19.16b, v26.16b //GHASH final-4 block - low + + pmull v27.1q, v27.1d, v21.1d //GHASH final-4 block - mid + st1 { v12.16b}, [x2], #16 //AES final-4 block - store result + pmull2 v28.1q, v8.2d, v20.2d //GHASH final-4 block - high + +.inst 0xce04752c //eor3 v12.16b, v9.16b, v4.16b, v29.16b //AES final-3 block - result + + eor v18.16b, v18.16b, v27.16b //GHASH final-4 block - mid + eor v17.16b, v17.16b, v28.16b //GHASH final-4 block - high +.L192_dec_blocks_more_than_3: //blocks left > 3 + + ldr q25, [x3, #112] //load h4l | h4h + ext v25.16b, v25.16b, v25.16b, #8 + rev64 v8.16b, v9.16b //GHASH final-3 block + ldr q9, [x0], #16 //AES final-2 block - load ciphertext + + eor v8.16b, v8.16b, v16.16b //feed in partial tag + + ins v27.d[0], v8.d[1] //GHASH final-3 block - mid + pmull2 v28.1q, v8.2d, v25.2d //GHASH final-3 block - high + + eor v17.16b, v17.16b, v28.16b //GHASH final-3 block - high + movi v16.8b, #0 //suppress further partial tag feed in + pmull v26.1q, v8.1d, v25.1d //GHASH 
final-3 block - low + + st1 { v12.16b}, [x2], #16 //AES final-3 block - store result + eor v27.8b, v27.8b, v8.8b //GHASH final-3 block - mid +.inst 0xce05752c //eor3 v12.16b, v9.16b, v5.16b, v29.16b //AES final-2 block - result + + eor v19.16b, v19.16b, v26.16b //GHASH final-3 block - low + ldr q24, [x3, #96] //load h4k | h3k + + ins v27.d[1], v27.d[0] //GHASH final-3 block - mid + + pmull2 v27.1q, v27.2d, v24.2d //GHASH final-3 block - mid + + eor v18.16b, v18.16b, v27.16b //GHASH final-3 block - mid +.L192_dec_blocks_more_than_2: //blocks left > 2 + + rev64 v8.16b, v9.16b //GHASH final-2 block + ldr q23, [x3, #80] //load h3l | h3h + ext v23.16b, v23.16b, v23.16b, #8 + + eor v8.16b, v8.16b, v16.16b //feed in partial tag + + ins v27.d[0], v8.d[1] //GHASH final-2 block - mid + ldr q9, [x0], #16 //AES final-1 block - load ciphertext + + pmull2 v28.1q, v8.2d, v23.2d //GHASH final-2 block - high + + eor v27.8b, v27.8b, v8.8b //GHASH final-2 block - mid + + eor v17.16b, v17.16b, v28.16b //GHASH final-2 block - high + pmull v26.1q, v8.1d, v23.1d //GHASH final-2 block - low + + pmull v27.1q, v27.1d, v24.1d //GHASH final-2 block - mid + movi v16.8b, #0 //suppress further partial tag feed in + + eor v19.16b, v19.16b, v26.16b //GHASH final-2 block - low + st1 { v12.16b}, [x2], #16 //AES final-2 block - store result + + eor v18.16b, v18.16b, v27.16b //GHASH final-2 block - mid +.inst 0xce06752c //eor3 v12.16b, v9.16b, v6.16b, v29.16b //AES final-1 block - result +.L192_dec_blocks_more_than_1: //blocks left > 1 + + rev64 v8.16b, v9.16b //GHASH final-1 block + ldr q9, [x0], #16 //AES final block - load ciphertext + ldr q22, [x3, #64] //load h1l | h1h + ext v22.16b, v22.16b, v22.16b, #8 + + eor v8.16b, v8.16b, v16.16b //feed in partial tag + movi v16.8b, #0 //suppress further partial tag feed in + ldr q21, [x3, #48] //load h2k | h1k + + pmull v26.1q, v8.1d, v22.1d //GHASH final-1 block - low + ins v27.d[0], v8.d[1] //GHASH final-1 block - mid + st1 { v12.16b}, [x2], #16 //AES final-1 block - store result + + pmull2 v28.1q, v8.2d, v22.2d //GHASH final-1 block - high + +.inst 0xce07752c //eor3 v12.16b, v9.16b, v7.16b, v29.16b //AES final block - result + + eor v27.8b, v27.8b, v8.8b //GHASH final-1 block - mid + + ins v27.d[1], v27.d[0] //GHASH final-1 block - mid + + pmull2 v27.1q, v27.2d, v21.2d //GHASH final-1 block - mid + + eor v19.16b, v19.16b, v26.16b //GHASH final-1 block - low + + eor v18.16b, v18.16b, v27.16b //GHASH final-1 block - mid + eor v17.16b, v17.16b, v28.16b //GHASH final-1 block - high +.L192_dec_blocks_less_than_1: //blocks left <= 1 + + rev32 v30.16b, v30.16b + and x1, x1, #127 //bit_length %= 128 + + sub x1, x1, #128 //bit_length -= 128 + str q30, [x16] //store the updated counter + + neg x1, x1 //bit_length = 128 - #bits in input (in range [1,128]) + mvn x6, xzr //temp0_x = 0xffffffffffffffff + + and x1, x1, #127 //bit_length %= 128 + + mvn x7, xzr //temp1_x = 0xffffffffffffffff + lsr x6, x6, x1 //temp0_x is mask for top 64b of last block + cmp x1, #64 + + csel x13, x7, x6, lt + csel x14, x6, xzr, lt + ldr q20, [x3, #32] //load h1l | h1h + ext v20.16b, v20.16b, v20.16b, #8 + + mov v0.d[1], x14 + ld1 { v26.16b}, [x2] //load existing bytes where the possibly partial last block is to be stored + + mov v0.d[0], x13 //ctr0b is mask for last block + + and v9.16b, v9.16b, v0.16b //possibly partial last block has zeroes in highest bits + bif v12.16b, v26.16b, v0.16b //insert existing bytes in top end of result before storing + + rev64 v8.16b, v9.16b //GHASH final block + + st1 { v12.16b}, 
[x2] //store all 16B + + eor v8.16b, v8.16b, v16.16b //feed in partial tag + + ins v16.d[0], v8.d[1] //GHASH final block - mid + pmull v26.1q, v8.1d, v20.1d //GHASH final block - low + + eor v16.8b, v16.8b, v8.8b //GHASH final block - mid + pmull2 v28.1q, v8.2d, v20.2d //GHASH final block - high + eor v19.16b, v19.16b, v26.16b //GHASH final block - low + + pmull v16.1q, v16.1d, v21.1d //GHASH final block - mid + eor v17.16b, v17.16b, v28.16b //GHASH final block - high + + eor v14.16b, v17.16b, v19.16b //MODULO - karatsuba tidy up + eor v18.16b, v18.16b, v16.16b //GHASH final block - mid + ldr d16, [x10] //MODULO - load modulo constant + + pmull v21.1q, v17.1d, v16.1d //MODULO - top 64b align with mid + ext v17.16b, v17.16b, v17.16b, #8 //MODULO - other top alignment + + eor v18.16b, v18.16b, v14.16b //MODULO - karatsuba tidy up + +.inst 0xce115652 //eor3 v18.16b, v18.16b, v17.16b, v21.16b //MODULO - fold into mid + + pmull v17.1q, v18.1d, v16.1d //MODULO - mid 64b align with low + ext v18.16b, v18.16b, v18.16b, #8 //MODULO - other mid alignment + +.inst 0xce124673 //eor3 v19.16b, v19.16b, v18.16b, v17.16b //MODULO - fold into low + ext v19.16b, v19.16b, v19.16b, #8 + rev64 v19.16b, v19.16b + st1 { v19.16b }, [x3] + + mov x0, x9 + + ldp d10, d11, [sp, #16] + ldp d12, d13, [sp, #32] + ldp d14, d15, [sp, #48] + ldp d8, d9, [sp], #80 + ret + +.L192_dec_ret: + mov w0, #0x0 + ret +.size unroll8_eor3_aes_gcm_dec_192_kernel,.-unroll8_eor3_aes_gcm_dec_192_kernel +.globl unroll8_eor3_aes_gcm_enc_256_kernel +.type unroll8_eor3_aes_gcm_enc_256_kernel,%function +.align 4 +unroll8_eor3_aes_gcm_enc_256_kernel: + AARCH64_VALID_CALL_TARGET + cbz x1, .L256_enc_ret + stp d8, d9, [sp, #-80]! + lsr x9, x1, #3 + mov x16, x4 + mov x8, x5 + stp d10, d11, [sp, #16] + stp d12, d13, [sp, #32] + stp d14, d15, [sp, #48] + mov x5, #0xc200000000000000 + stp x5, xzr, [sp, #64] + add x10, sp, #64 + + ld1 { v0.16b}, [x16] //CTR block 0 + + mov x5, x9 + + mov x15, #0x100000000 //set up counter increment + movi v31.16b, #0x0 + mov v31.d[1], x15 + sub x5, x5, #1 //byte_len - 1 + + and x5, x5, #0xffffffffffffff80 //number of bytes to be processed in main loop (at least 1 byte must be handled by tail) + + add x5, x5, x0 + + rev32 v30.16b, v0.16b //set up reversed counter + + add v30.4s, v30.4s, v31.4s //CTR block 0 + + rev32 v1.16b, v30.16b //CTR block 1 + add v30.4s, v30.4s, v31.4s //CTR block 1 + + rev32 v2.16b, v30.16b //CTR block 2 + add v30.4s, v30.4s, v31.4s //CTR block 2 + + rev32 v3.16b, v30.16b //CTR block 3 + add v30.4s, v30.4s, v31.4s //CTR block 3 + + rev32 v4.16b, v30.16b //CTR block 4 + add v30.4s, v30.4s, v31.4s //CTR block 4 + + rev32 v5.16b, v30.16b //CTR block 5 + add v30.4s, v30.4s, v31.4s //CTR block 5 + ldp q26, q27, [x8, #0] //load rk0, rk1 + + rev32 v6.16b, v30.16b //CTR block 6 + add v30.4s, v30.4s, v31.4s //CTR block 6 + + rev32 v7.16b, v30.16b //CTR block 7 + + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 3 - round 0 + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 4 - round 0 + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 2 - round 0 + + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 0 - round 0 + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 1 - round 0 + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 6 - round 0 + + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 5 - round 0 + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 7 - round 0 + ldp q28, q26, [x8, #32] //load rk2, rk3 + + aese v4.16b, v27.16b + aesmc v4.16b, v4.16b 
//AES block 4 - round 1 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 1 - round 1 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 3 - round 1 + + aese v6.16b, v27.16b + aesmc v6.16b, v6.16b //AES block 6 - round 1 + aese v5.16b, v27.16b + aesmc v5.16b, v5.16b //AES block 5 - round 1 + + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 2 - round 1 + + aese v7.16b, v27.16b + aesmc v7.16b, v7.16b //AES block 7 - round 1 + + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 2 - round 2 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 3 - round 2 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 0 - round 1 + + aese v7.16b, v28.16b + aesmc v7.16b, v7.16b //AES block 7 - round 2 + aese v6.16b, v28.16b + aesmc v6.16b, v6.16b //AES block 6 - round 2 + aese v5.16b, v28.16b + aesmc v5.16b, v5.16b //AES block 5 - round 2 + + aese v4.16b, v28.16b + aesmc v4.16b, v4.16b //AES block 4 - round 2 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 0 - round 2 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 1 - round 2 + + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 5 - round 3 + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 3 - round 3 + ldp q27, q28, [x8, #64] //load rk4, rk5 + + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 4 - round 3 + + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 1 - round 3 + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 6 - round 3 + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 7 - round 3 + + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 2 - round 3 + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 0 - round 3 + + aese v4.16b, v27.16b + aesmc v4.16b, v4.16b //AES block 4 - round 4 + aese v6.16b, v27.16b + aesmc v6.16b, v6.16b //AES block 6 - round 4 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 1 - round 4 + + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 2 - round 4 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 0 - round 4 + + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 3 - round 4 + aese v7.16b, v27.16b + aesmc v7.16b, v7.16b //AES block 7 - round 4 + aese v5.16b, v27.16b + aesmc v5.16b, v5.16b //AES block 5 - round 4 + + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 0 - round 5 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 2 - round 5 + ldp q26, q27, [x8, #96] //load rk6, rk7 + + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 1 - round 5 + aese v4.16b, v28.16b + aesmc v4.16b, v4.16b //AES block 4 - round 5 + aese v5.16b, v28.16b + aesmc v5.16b, v5.16b //AES block 5 - round 5 + + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 3 - round 5 + aese v6.16b, v28.16b + aesmc v6.16b, v6.16b //AES block 6 - round 5 + aese v7.16b, v28.16b + aesmc v7.16b, v7.16b //AES block 7 - round 5 + + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 1 - round 6 + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 5 - round 6 + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 4 - round 6 + + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 2 - round 6 + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 6 - round 6 + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 0 - round 6 + + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 7 - round 6 + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 3 - round 6 + ldp q28, q26, [x8, #128] //load rk8, rk9 + + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 2 - round 7 + 
aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 0 - round 7 + + aese v7.16b, v27.16b + aesmc v7.16b, v7.16b //AES block 7 - round 7 + aese v6.16b, v27.16b + aesmc v6.16b, v6.16b //AES block 6 - round 7 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 1 - round 7 + + aese v5.16b, v27.16b + aesmc v5.16b, v5.16b //AES block 5 - round 7 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 3 - round 7 + + aese v4.16b, v27.16b + aesmc v4.16b, v4.16b //AES block 4 - round 7 + + aese v6.16b, v28.16b + aesmc v6.16b, v6.16b //AES block 6 - round 8 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 1 - round 8 + + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 3 - round 8 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 0 - round 8 + aese v7.16b, v28.16b + aesmc v7.16b, v7.16b //AES block 7 - round 8 + + aese v5.16b, v28.16b + aesmc v5.16b, v5.16b //AES block 5 - round 8 + aese v4.16b, v28.16b + aesmc v4.16b, v4.16b //AES block 4 - round 8 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 2 - round 8 + + ld1 { v19.16b}, [x3] + ext v19.16b, v19.16b, v19.16b, #8 + rev64 v19.16b, v19.16b + ldp q27, q28, [x8, #160] //load rk10, rk11 + + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 6 - round 9 + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 7 - round 9 + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 3 - round 9 + + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 4 - round 9 + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 5 - round 9 + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 2 - round 9 + + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 1 - round 9 + + aese v7.16b, v27.16b + aesmc v7.16b, v7.16b //AES block 7 - round 10 + aese v4.16b, v27.16b + aesmc v4.16b, v4.16b //AES block 4 - round 10 + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 0 - round 9 + + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 1 - round 10 + aese v5.16b, v27.16b + aesmc v5.16b, v5.16b //AES block 5 - round 10 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 3 - round 10 + + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 2 - round 10 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 0 - round 10 + aese v6.16b, v27.16b + aesmc v6.16b, v6.16b //AES block 6 - round 10 + + aese v4.16b, v28.16b + aesmc v4.16b, v4.16b //AES block 4 - round 11 + ldp q26, q27, [x8, #192] //load rk12, rk13 + aese v5.16b, v28.16b + aesmc v5.16b, v5.16b //AES block 5 - round 11 + + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 2 - round 11 + aese v6.16b, v28.16b + aesmc v6.16b, v6.16b //AES block 6 - round 11 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 1 - round 11 + + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 0 - round 11 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 3 - round 11 + aese v7.16b, v28.16b + aesmc v7.16b, v7.16b //AES block 7 - round 11 + + add v30.4s, v30.4s, v31.4s //CTR block 7 + ldr q28, [x8, #224] //load rk14 + + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 4 - round 12 + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 2 - round 12 + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 1 - round 12 + + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 0 - round 12 + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 5 - round 12 + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 3 - round 12 + + aese v2.16b, v27.16b //AES block 2 - round 13 + aese v1.16b, v27.16b //AES block 1 - round 
13 + aese v4.16b, v27.16b //AES block 4 - round 13 + + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 6 - round 12 + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 7 - round 12 + + aese v0.16b, v27.16b //AES block 0 - round 13 + aese v5.16b, v27.16b //AES block 5 - round 13 + + aese v6.16b, v27.16b //AES block 6 - round 13 + aese v7.16b, v27.16b //AES block 7 - round 13 + aese v3.16b, v27.16b //AES block 3 - round 13 + + add x4, x0, x1, lsr #3 //end_input_ptr + cmp x0, x5 //check if we have <= 8 blocks + b.ge .L256_enc_tail //handle tail + + ldp q8, q9, [x0], #32 //AES block 0, 1 - load plaintext + + ldp q10, q11, [x0], #32 //AES block 2, 3 - load plaintext + +.inst 0xce007108 //eor3 v8.16b, v8.16b, v0.16b, v28.16b //AES block 0 - result + rev32 v0.16b, v30.16b //CTR block 8 + add v30.4s, v30.4s, v31.4s //CTR block 8 + +.inst 0xce017129 //eor3 v9.16b, v9.16b, v1.16b, v28.16b //AES block 1 - result +.inst 0xce03716b //eor3 v11.16b, v11.16b, v3.16b, v28.16b //AES block 3 - result + + rev32 v1.16b, v30.16b //CTR block 9 + add v30.4s, v30.4s, v31.4s //CTR block 9 + ldp q12, q13, [x0], #32 //AES block 4, 5 - load plaintext + + ldp q14, q15, [x0], #32 //AES block 6, 7 - load plaintext +.inst 0xce02714a //eor3 v10.16b, v10.16b, v2.16b, v28.16b //AES block 2 - result + cmp x0, x5 //check if we have <= 8 blocks + + rev32 v2.16b, v30.16b //CTR block 10 + add v30.4s, v30.4s, v31.4s //CTR block 10 + stp q8, q9, [x2], #32 //AES block 0, 1 - store result + + stp q10, q11, [x2], #32 //AES block 2, 3 - store result + + rev32 v3.16b, v30.16b //CTR block 11 + add v30.4s, v30.4s, v31.4s //CTR block 11 + +.inst 0xce04718c //eor3 v12.16b, v12.16b, v4.16b, v28.16b //AES block 4 - result + +.inst 0xce0771ef //eor3 v15.16b, v15.16b, v7.16b, v28.16b //AES block 7 - result +.inst 0xce0671ce //eor3 v14.16b, v14.16b, v6.16b, v28.16b //AES block 6 - result +.inst 0xce0571ad //eor3 v13.16b, v13.16b, v5.16b, v28.16b //AES block 5 - result + + stp q12, q13, [x2], #32 //AES block 4, 5 - store result + rev32 v4.16b, v30.16b //CTR block 12 + + stp q14, q15, [x2], #32 //AES block 6, 7 - store result + add v30.4s, v30.4s, v31.4s //CTR block 12 + b.ge .L256_enc_prepretail //do prepretail + +.L256_enc_main_loop: //main loop start + ldp q26, q27, [x8, #0] //load rk0, rk1 + + rev32 v5.16b, v30.16b //CTR block 8k+13 + add v30.4s, v30.4s, v31.4s //CTR block 8k+13 + ldr q21, [x3, #144] //load h6k | h5k + ldr q24, [x3, #192] //load h8k | h7k + + rev64 v11.16b, v11.16b //GHASH block 8k+3 + ldr q20, [x3, #128] //load h5l | h5h + ext v20.16b, v20.16b, v20.16b, #8 + ldr q22, [x3, #160] //load h6l | h6h + ext v22.16b, v22.16b, v22.16b, #8 + rev64 v9.16b, v9.16b //GHASH block 8k+1 + + rev32 v6.16b, v30.16b //CTR block 8k+14 + add v30.4s, v30.4s, v31.4s //CTR block 8k+14 + rev64 v8.16b, v8.16b //GHASH block 8k + + rev64 v12.16b, v12.16b //GHASH block 8k+4 + ext v19.16b, v19.16b, v19.16b, #8 //PRE 0 + ldr q23, [x3, #176] //load h7l | h7h + ext v23.16b, v23.16b, v23.16b, #8 + ldr q25, [x3, #208] //load h8l | h8h + ext v25.16b, v25.16b, v25.16b, #8 + + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 0 + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 0 + rev32 v7.16b, v30.16b //CTR block 8k+15 + + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 0 + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 0 + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 0 + + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - 
round 0 + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 0 + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 0 + + ldp q28, q26, [x8, #32] //load rk2, rk3 + eor v8.16b, v8.16b, v19.16b //PRE 1 + aese v6.16b, v27.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 1 + + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 1 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 1 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 1 + + aese v4.16b, v27.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 1 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 1 + aese v5.16b, v27.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 1 + + pmull2 v17.1q, v8.2d, v25.2d //GHASH block 8k - high + pmull v19.1q, v8.1d, v25.1d //GHASH block 8k - low + pmull2 v16.1q, v9.2d, v23.2d //GHASH block 8k+1 - high + + trn1 v18.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid + trn2 v8.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid + aese v7.16b, v27.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 1 + + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 2 + aese v5.16b, v28.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 2 + aese v6.16b, v28.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 2 + + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 2 + pmull v23.1q, v9.1d, v23.1d //GHASH block 8k+1 - low + aese v4.16b, v28.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 2 + + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 3 + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 3 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 2 + + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 3 + aese v7.16b, v28.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 2 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 2 + + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 3 + rev64 v14.16b, v14.16b //GHASH block 8k+6 + pmull2 v9.1q, v11.2d, v20.2d //GHASH block 8k+3 - high + + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 3 + ldp q27, q28, [x8, #64] //load rk4, rk5 + rev64 v10.16b, v10.16b //GHASH block 8k+2 + + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 3 + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 3 + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 3 + + eor v17.16b, v17.16b, v16.16b //GHASH block 8k+1 - high + pmull2 v29.1q, v10.2d, v22.2d //GHASH block 8k+2 - high + rev64 v13.16b, v13.16b //GHASH block 8k+5 + + pmull v20.1q, v11.1d, v20.1d //GHASH block 8k+3 - low + eor v19.16b, v19.16b, v23.16b //GHASH block 8k+1 - low + ldr q23, [x3, #80] //load h3l | h3h + ext v23.16b, v23.16b, v23.16b, #8 + ldr q25, [x3, #112] //load h4l | h4h + ext v25.16b, v25.16b, v25.16b, #8 + + trn1 v16.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid +.inst 0xce1d2631 //eor3 v17.16b, v17.16b, v29.16b, v9.16b //GHASH block 8k+2, 8k+3 - high + pmull v22.1q, v10.1d, v22.1d //GHASH block 8k+2 - low + + aese v4.16b, v27.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 4 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 4 + aese v5.16b, v27.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 4 + + aese v7.16b, v27.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 4 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - 
round 4 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 4 + + trn1 v29.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid + aese v6.16b, v27.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 4 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 4 + + trn2 v10.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid + eor v8.16b, v8.16b, v18.16b //GHASH block 8k, 8k+1 - mid + ldp q26, q27, [x8, #96] //load rk6, rk7 + + aese v5.16b, v28.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 5 + aese v7.16b, v28.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 5 + aese v4.16b, v28.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 5 + + eor v10.16b, v10.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 5 + rev64 v15.16b, v15.16b //GHASH block 8k+7 + + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 5 + aese v6.16b, v28.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 5 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 5 + + pmull2 v29.1q, v10.2d, v21.2d //GHASH block 8k+2 - mid + pmull2 v18.1q, v8.2d, v24.2d //GHASH block 8k - mid + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 5 + + pmull v24.1q, v8.1d, v24.1d //GHASH block 8k+1 - mid + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 6 + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 6 + + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 6 + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 6 + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 6 + + eor v18.16b, v18.16b, v24.16b //GHASH block 8k+1 - mid + pmull v21.1q, v10.1d, v21.1d //GHASH block 8k+3 - mid + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 6 + +.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+2, 8k+3 - low + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 6 + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 6 + + ldp q28, q26, [x8, #128] //load rk8, rk9 + pmull2 v8.1q, v12.2d, v25.2d //GHASH block 8k+4 - high + aese v5.16b, v27.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 7 + + ldr q20, [x3, #32] //load h1l | h1h + ext v20.16b, v20.16b, v20.16b, #8 + ldr q22, [x3, #64] //load h2l | h2h + ext v22.16b, v22.16b, v22.16b, #8 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 7 +.inst 0xce157652 //eor3 v18.16b, v18.16b, v21.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid + + ldr q21, [x3, #48] //load h2k | h1k + ldr q24, [x3, #96] //load h4k | h3k + aese v6.16b, v27.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 7 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 7 + + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 7 + aese v7.16b, v27.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 7 + pmull v25.1q, v12.1d, v25.1d //GHASH block 8k+4 - low + + trn2 v12.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid + aese v4.16b, v27.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 7 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 7 + + pmull2 v10.1q, v13.2d, v23.2d //GHASH block 8k+5 - high + aese v7.16b, v28.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 8 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 8 + + pmull v23.1q, v13.1d, v23.1d //GHASH block 8k+5 - low + trn1 v13.2d, v15.2d, 
v14.2d //GHASH block 8k+6, 8k+7 - mid + eor v12.16b, v12.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid + + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 8 + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 9 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 8 + + pmull2 v16.1q, v12.2d, v24.2d //GHASH block 8k+4 - mid + pmull v24.1q, v12.1d, v24.1d //GHASH block 8k+5 - mid + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 8 + + aese v5.16b, v28.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 8 + pmull2 v11.1q, v14.2d, v22.2d //GHASH block 8k+6 - high + pmull v22.1q, v14.1d, v22.1d //GHASH block 8k+6 - low + + aese v6.16b, v28.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 8 + trn2 v14.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid + aese v4.16b, v28.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 8 + +.inst 0xce184252 //eor3 v18.16b, v18.16b, v24.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 9 + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 9 + + eor v14.16b, v14.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 9 + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 9 + + ldp q27, q28, [x8, #160] //load rk10, rk11 + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 9 + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 9 + + pmull2 v12.1q, v15.2d, v20.2d //GHASH block 8k+7 - high +.inst 0xce195e73 //eor3 v19.16b, v19.16b, v25.16b, v23.16b //GHASH block 8k+4, 8k+5 - low + pmull v20.1q, v15.1d, v20.1d //GHASH block 8k+7 - low + + ldr d16, [x10] //MODULO - load modulo constant + pmull2 v13.1q, v14.2d, v21.2d //GHASH block 8k+6 - mid + pmull v21.1q, v14.1d, v21.1d //GHASH block 8k+7 - mid + + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 9 + +.inst 0xce153652 //eor3 v18.16b, v18.16b, v21.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid +.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+6, 8k+7 - low +.inst 0xce082a31 //eor3 v17.16b, v17.16b, v8.16b, v10.16b //GHASH block 8k+4, 8k+5 - high + + aese v4.16b, v27.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 10 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 10 + aese v5.16b, v27.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 10 + + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 10 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 10 + add v30.4s, v30.4s, v31.4s //CTR block 8k+15 + + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 10 + aese v7.16b, v27.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 10 + aese v6.16b, v27.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 10 + +.inst 0xce0b3231 //eor3 v17.16b, v17.16b, v11.16b, v12.16b //GHASH block 8k+6, 8k+7 - high + + ldp q26, q27, [x8, #192] //load rk12, rk13 + rev32 v20.16b, v30.16b //CTR block 8k+16 + + ext v21.16b, v17.16b, v17.16b, #8 //MODULO - other top alignment + ldp q8, q9, [x0], #32 //AES block 8k+8, 8k+9 - load plaintext + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 11 + + aese v6.16b, v28.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 11 + add v30.4s, v30.4s, v31.4s //CTR block 8k+16 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 11 + + aese v0.16b, v28.16b + 
aesmc v0.16b, v0.16b //AES block 8k+8 - round 11 + aese v7.16b, v28.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 11 + + pmull v29.1q, v17.1d, v16.1d //MODULO - top 64b align with mid + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 11 + + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 12 + aese v5.16b, v28.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 11 + + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 12 + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 12 + rev32 v22.16b, v30.16b //CTR block 8k+17 + + add v30.4s, v30.4s, v31.4s //CTR block 8k+17 + aese v4.16b, v28.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 11 +.inst 0xce114e52 //eor3 v18.16b, v18.16b, v17.16b, v19.16b //MODULO - karatsuba tidy up + + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 12 + ldr q28, [x8, #224] //load rk14 + aese v7.16b, v27.16b //AES block 8k+15 - round 13 + + ldp q10, q11, [x0], #32 //AES block 8k+10, 8k+11 - load plaintext + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 12 + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 12 + +.inst 0xce1d5652 //eor3 v18.16b, v18.16b, v29.16b, v21.16b //MODULO - fold into mid + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 12 + ldp q12, q13, [x0], #32 //AES block 4, 5 - load plaintext + + ldp q14, q15, [x0], #32 //AES block 6, 7 - load plaintext + aese v2.16b, v27.16b //AES block 8k+10 - round 13 + aese v4.16b, v27.16b //AES block 8k+12 - round 13 + + rev32 v23.16b, v30.16b //CTR block 8k+18 + add v30.4s, v30.4s, v31.4s //CTR block 8k+18 + aese v5.16b, v27.16b //AES block 8k+13 - round 13 + + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 12 + aese v3.16b, v27.16b //AES block 8k+11 - round 13 + cmp x0, x5 //.LOOP CONTROL + +.inst 0xce02714a //eor3 v10.16b, v10.16b, v2.16b, v28.16b //AES block 8k+10 - result + rev32 v25.16b, v30.16b //CTR block 8k+19 + add v30.4s, v30.4s, v31.4s //CTR block 8k+19 + + aese v0.16b, v27.16b //AES block 8k+8 - round 13 + aese v6.16b, v27.16b //AES block 8k+14 - round 13 +.inst 0xce0571ad //eor3 v13.16b, v13.16b, v5.16b, v28.16b //AES block 5 - result + + ext v21.16b, v18.16b, v18.16b, #8 //MODULO - other mid alignment + pmull v17.1q, v18.1d, v16.1d //MODULO - mid 64b align with low + aese v1.16b, v27.16b //AES block 8k+9 - round 13 + +.inst 0xce04718c //eor3 v12.16b, v12.16b, v4.16b, v28.16b //AES block 4 - result + rev32 v4.16b, v30.16b //CTR block 8k+20 +.inst 0xce03716b //eor3 v11.16b, v11.16b, v3.16b, v28.16b //AES block 8k+11 - result + + mov v3.16b, v25.16b //CTR block 8k+19 +.inst 0xce017129 //eor3 v9.16b, v9.16b, v1.16b, v28.16b //AES block 8k+9 - result +.inst 0xce007108 //eor3 v8.16b, v8.16b, v0.16b, v28.16b //AES block 8k+8 - result + + add v30.4s, v30.4s, v31.4s //CTR block 8k+20 + stp q8, q9, [x2], #32 //AES block 8k+8, 8k+9 - store result + mov v2.16b, v23.16b //CTR block 8k+18 + +.inst 0xce0771ef //eor3 v15.16b, v15.16b, v7.16b, v28.16b //AES block 7 - result +.inst 0xce154673 //eor3 v19.16b, v19.16b, v21.16b, v17.16b //MODULO - fold into low + stp q10, q11, [x2], #32 //AES block 8k+10, 8k+11 - store result + +.inst 0xce0671ce //eor3 v14.16b, v14.16b, v6.16b, v28.16b //AES block 6 - result + mov v1.16b, v22.16b //CTR block 8k+17 + stp q12, q13, [x2], #32 //AES block 4, 5 - store result + + stp q14, q15, [x2], #32 //AES block 6, 7 - store result + mov v0.16b, v20.16b //CTR block 8k+16 + b.lt 
.L256_enc_main_loop + +.L256_enc_prepretail: //PREPRETAIL + rev32 v5.16b, v30.16b //CTR block 8k+13 + ldp q26, q27, [x8, #0] //load rk0, rk1 + add v30.4s, v30.4s, v31.4s //CTR block 8k+13 + + rev64 v10.16b, v10.16b //GHASH block 8k+2 + + rev32 v6.16b, v30.16b //CTR block 8k+14 + add v30.4s, v30.4s, v31.4s //CTR block 8k+14 + + rev64 v13.16b, v13.16b //GHASH block 8k+5 + ldr q21, [x3, #144] //load h6k | h5k + ldr q24, [x3, #192] //load h8k | h7k + + rev32 v7.16b, v30.16b //CTR block 8k+15 + + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 0 + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 0 + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 0 + + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 0 + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 0 + + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 0 + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 0 + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 0 + + ext v19.16b, v19.16b, v19.16b, #8 //PRE 0 + rev64 v8.16b, v8.16b //GHASH block 8k + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 1 + + rev64 v9.16b, v9.16b //GHASH block 8k+1 + ldp q28, q26, [x8, #32] //load rk2, rk3 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 1 + + ldr q23, [x3, #176] //load h7l | h7h + ext v23.16b, v23.16b, v23.16b, #8 + ldr q25, [x3, #208] //load h8l | h8h + ext v25.16b, v25.16b, v25.16b, #8 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 1 + + ldr q20, [x3, #128] //load h5l | h5h + ext v20.16b, v20.16b, v20.16b, #8 + ldr q22, [x3, #160] //load h6l | h6h + ext v22.16b, v22.16b, v22.16b, #8 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 1 + aese v5.16b, v27.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 1 + + aese v4.16b, v27.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 1 + eor v8.16b, v8.16b, v19.16b //PRE 1 + + rev64 v11.16b, v11.16b //GHASH block 8k+3 + aese v6.16b, v27.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 1 + + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 2 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 2 + aese v7.16b, v27.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 1 + + aese v4.16b, v28.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 2 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 2 + aese v6.16b, v28.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 2 + + aese v5.16b, v28.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 2 + aese v7.16b, v28.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 2 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 2 + + ldp q27, q28, [x8, #64] //load rk4, rk5 + trn1 v18.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid + pmull2 v17.1q, v8.2d, v25.2d //GHASH block 8k - high + + rev64 v14.16b, v14.16b //GHASH block 8k+6 + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 3 + pmull2 v16.1q, v9.2d, v23.2d //GHASH block 8k+1 - high + + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 3 + pmull v19.1q, v8.1d, v25.1d //GHASH block 8k - low + trn2 v8.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid + + pmull2 v29.1q, v10.2d, v22.2d //GHASH block 8k+2 - high + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 3 + + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 8k+10 
- round 3 + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 3 + eor v17.16b, v17.16b, v16.16b //GHASH block 8k+1 - high + + pmull v23.1q, v9.1d, v23.1d //GHASH block 8k+1 - low + pmull2 v9.1q, v11.2d, v20.2d //GHASH block 8k+3 - high + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 3 + + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 3 + eor v8.16b, v8.16b, v18.16b //GHASH block 8k, 8k+1 - mid + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 3 + + pmull v22.1q, v10.1d, v22.1d //GHASH block 8k+2 - low + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 4 + aese v6.16b, v27.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 4 + + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 4 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 4 + aese v4.16b, v27.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 4 + + aese v6.16b, v28.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 5 + pmull2 v18.1q, v8.2d, v24.2d //GHASH block 8k - mid +.inst 0xce1d2631 //eor3 v17.16b, v17.16b, v29.16b, v9.16b //GHASH block 8k+2, 8k+3 - high + + aese v7.16b, v27.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 4 + trn1 v29.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid + trn2 v10.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid + + aese v5.16b, v27.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 4 + eor v19.16b, v19.16b, v23.16b //GHASH block 8k+1 - low + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 4 + + pmull v20.1q, v11.1d, v20.1d //GHASH block 8k+3 - low + pmull v24.1q, v8.1d, v24.1d //GHASH block 8k+1 - mid + eor v10.16b, v10.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid + + rev64 v12.16b, v12.16b //GHASH block 8k+4 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 5 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 5 + + aese v7.16b, v28.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 5 + aese v4.16b, v28.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 5 + ldp q26, q27, [x8, #96] //load rk6, rk7 + + ldr q23, [x3, #80] //load h3l | h3h + ext v23.16b, v23.16b, v23.16b, #8 + ldr q25, [x3, #112] //load h4l | h4h + ext v25.16b, v25.16b, v25.16b, #8 + pmull2 v29.1q, v10.2d, v21.2d //GHASH block 8k+2 - mid + pmull v21.1q, v10.1d, v21.1d //GHASH block 8k+3 - mid + +.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+2, 8k+3 - low + eor v18.16b, v18.16b, v24.16b //GHASH block 8k+1 - mid + + aese v5.16b, v28.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 5 + rev64 v15.16b, v15.16b //GHASH block 8k+7 + trn1 v16.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid + + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 5 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 5 +.inst 0xce157652 //eor3 v18.16b, v18.16b, v21.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid + + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 6 + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 6 + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 6 + + ldr q21, [x3, #48] //load h2k | h1k + ldr q24, [x3, #96] //load h4k | h3k + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 6 + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 6 + + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 6 + aese v1.16b, v26.16b + aesmc v1.16b, 
v1.16b //AES block 8k+9 - round 6 + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 6 + + pmull2 v8.1q, v12.2d, v25.2d //GHASH block 8k+4 - high + pmull v25.1q, v12.1d, v25.1d //GHASH block 8k+4 - low + ldr q20, [x3, #32] //load h1l | h1h + ext v20.16b, v20.16b, v20.16b, #8 + ldr q22, [x3, #64] //load h2l | h2h + ext v22.16b, v22.16b, v22.16b, #8 + + ldp q28, q26, [x8, #128] //load rk8, rk9 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 7 + aese v4.16b, v27.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 7 + + pmull2 v10.1q, v13.2d, v23.2d //GHASH block 8k+5 - high + trn2 v12.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid + + aese v5.16b, v27.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 7 + aese v6.16b, v27.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 7 + pmull v23.1q, v13.1d, v23.1d //GHASH block 8k+5 - low + + aese v7.16b, v27.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 7 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 7 + eor v12.16b, v12.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid + + pmull2 v11.1q, v14.2d, v22.2d //GHASH block 8k+6 - high + pmull v22.1q, v14.1d, v22.1d //GHASH block 8k+6 - low + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 7 + + trn1 v13.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid + trn2 v14.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 7 + + aese v7.16b, v28.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 8 +.inst 0xce195e73 //eor3 v19.16b, v19.16b, v25.16b, v23.16b //GHASH block 8k+4, 8k+5 - low + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 8 + + aese v6.16b, v28.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 8 + aese v4.16b, v28.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 8 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 8 + + aese v5.16b, v28.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 8 + eor v14.16b, v14.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 8 + + pmull2 v16.1q, v12.2d, v24.2d //GHASH block 8k+4 - mid + pmull v24.1q, v12.1d, v24.1d //GHASH block 8k+5 - mid + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 8 + + pmull2 v12.1q, v15.2d, v20.2d //GHASH block 8k+7 - high + pmull2 v13.1q, v14.2d, v21.2d //GHASH block 8k+6 - mid + pmull v21.1q, v14.1d, v21.1d //GHASH block 8k+7 - mid + + pmull v20.1q, v15.1d, v20.1d //GHASH block 8k+7 - low +.inst 0xce184252 //eor3 v18.16b, v18.16b, v24.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid +.inst 0xce082a31 //eor3 v17.16b, v17.16b, v8.16b, v10.16b //GHASH block 8k+4, 8k+5 - high + + ldp q27, q28, [x8, #160] //load rk10, rk11 + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 9 + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 9 + +.inst 0xce0b3231 //eor3 v17.16b, v17.16b, v11.16b, v12.16b //GHASH block 8k+6, 8k+7 - high +.inst 0xce153652 //eor3 v18.16b, v18.16b, v21.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid + ldr d16, [x10] //MODULO - load modulo constant + +.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+6, 8k+7 - low + + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 9 + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 9 + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 9 + + aese v2.16b, v26.16b + aesmc v2.16b, 
v2.16b //AES block 8k+10 - round 9 + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 9 + + aese v5.16b, v27.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 10 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 10 + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 9 + + aese v7.16b, v27.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 10 + aese v6.16b, v27.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 10 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 10 + + aese v4.16b, v27.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 10 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 10 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 10 + + pmull v29.1q, v17.1d, v16.1d //MODULO - top 64b align with mid +.inst 0xce114e52 //eor3 v18.16b, v18.16b, v17.16b, v19.16b //MODULO - karatsuba tidy up + aese v7.16b, v28.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 11 + + ldp q26, q27, [x8, #192] //load rk12, rk13 + ext v21.16b, v17.16b, v17.16b, #8 //MODULO - other top alignment + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 11 + +.inst 0xce1d5652 //eor3 v18.16b, v18.16b, v29.16b, v21.16b //MODULO - fold into mid + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 11 + aese v6.16b, v28.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 11 + + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 11 + aese v4.16b, v28.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 11 + aese v5.16b, v28.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 11 + + pmull v17.1q, v18.1d, v16.1d //MODULO - mid 64b align with low + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 11 + ldr q28, [x8, #224] //load rk14 + + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 12 + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 12 + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 12 + + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 12 + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 12 + ext v21.16b, v18.16b, v18.16b, #8 //MODULO - other mid alignment + + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 12 + add v30.4s, v30.4s, v31.4s //CTR block 8k+15 + + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 12 + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 12 + aese v0.16b, v27.16b //AES block 8k+8 - round 13 + +.inst 0xce154673 //eor3 v19.16b, v19.16b, v21.16b, v17.16b //MODULO - fold into low + aese v5.16b, v27.16b //AES block 8k+13 - round 13 + aese v1.16b, v27.16b //AES block 8k+9 - round 13 + + aese v3.16b, v27.16b //AES block 8k+11 - round 13 + aese v4.16b, v27.16b //AES block 8k+12 - round 13 + aese v7.16b, v27.16b //AES block 8k+15 - round 13 + + aese v2.16b, v27.16b //AES block 8k+10 - round 13 + aese v6.16b, v27.16b //AES block 8k+14 - round 13 +.L256_enc_tail: //TAIL + + ldp q24, q25, [x3, #192] //load h8l | h8h + ext v25.16b, v25.16b, v25.16b, #8 + sub x5, x4, x0 //main_end_input_ptr is number of bytes left to process + + ldr q8, [x0], #16 //AES block 8k+8 - load plaintext + + ldp q20, q21, [x3, #128] //load h5l | h5h + ext v20.16b, v20.16b, v20.16b, #8 + + ext v16.16b, v19.16b, v19.16b, #8 //prepare final partial tag + ldp q22, q23, [x3, #160] //load h6l | h6h + ext v22.16b, v22.16b, v22.16b, #8 + ext v23.16b, 
v23.16b, v23.16b, #8 + mov v29.16b, v28.16b + + cmp x5, #112 +.inst 0xce007509 //eor3 v9.16b, v8.16b, v0.16b, v29.16b //AES block 8k+8 - result + b.gt .L256_enc_blocks_more_than_7 + + movi v19.8b, #0 + mov v7.16b, v6.16b + movi v17.8b, #0 + + mov v6.16b, v5.16b + mov v5.16b, v4.16b + mov v4.16b, v3.16b + + mov v3.16b, v2.16b + sub v30.4s, v30.4s, v31.4s + mov v2.16b, v1.16b + + movi v18.8b, #0 + cmp x5, #96 + b.gt .L256_enc_blocks_more_than_6 + + mov v7.16b, v6.16b + mov v6.16b, v5.16b + cmp x5, #80 + + mov v5.16b, v4.16b + mov v4.16b, v3.16b + mov v3.16b, v1.16b + + sub v30.4s, v30.4s, v31.4s + b.gt .L256_enc_blocks_more_than_5 + + mov v7.16b, v6.16b + sub v30.4s, v30.4s, v31.4s + + mov v6.16b, v5.16b + mov v5.16b, v4.16b + + cmp x5, #64 + mov v4.16b, v1.16b + b.gt .L256_enc_blocks_more_than_4 + + cmp x5, #48 + mov v7.16b, v6.16b + mov v6.16b, v5.16b + + mov v5.16b, v1.16b + sub v30.4s, v30.4s, v31.4s + b.gt .L256_enc_blocks_more_than_3 + + cmp x5, #32 + mov v7.16b, v6.16b + ldr q24, [x3, #96] //load h4k | h3k + + mov v6.16b, v1.16b + sub v30.4s, v30.4s, v31.4s + b.gt .L256_enc_blocks_more_than_2 + + mov v7.16b, v1.16b + + sub v30.4s, v30.4s, v31.4s + cmp x5, #16 + b.gt .L256_enc_blocks_more_than_1 + + sub v30.4s, v30.4s, v31.4s + ldr q21, [x3, #48] //load h2k | h1k + b .L256_enc_blocks_less_than_1 +.L256_enc_blocks_more_than_7: //blocks left > 7 + st1 { v9.16b}, [x2], #16 //AES final-7 block - store result + + rev64 v8.16b, v9.16b //GHASH final-7 block + + eor v8.16b, v8.16b, v16.16b //feed in partial tag + + ldr q9, [x0], #16 //AES final-6 block - load plaintext + + pmull2 v17.1q, v8.2d, v25.2d //GHASH final-7 block - high + ins v27.d[0], v8.d[1] //GHASH final-7 block - mid + ins v18.d[0], v24.d[1] //GHASH final-7 block - mid + + movi v16.8b, #0 //suppress further partial tag feed in + + eor v27.8b, v27.8b, v8.8b //GHASH final-7 block - mid +.inst 0xce017529 //eor3 v9.16b, v9.16b, v1.16b, v29.16b //AES final-6 block - result + + pmull v18.1q, v27.1d, v18.1d //GHASH final-7 block - mid + pmull v19.1q, v8.1d, v25.1d //GHASH final-7 block - low +.L256_enc_blocks_more_than_6: //blocks left > 6 + + st1 { v9.16b}, [x2], #16 //AES final-6 block - store result + + rev64 v8.16b, v9.16b //GHASH final-6 block + + eor v8.16b, v8.16b, v16.16b //feed in partial tag + + pmull v26.1q, v8.1d, v23.1d //GHASH final-6 block - low + ins v27.d[0], v8.d[1] //GHASH final-6 block - mid + pmull2 v28.1q, v8.2d, v23.2d //GHASH final-6 block - high + + ldr q9, [x0], #16 //AES final-5 block - load plaintext + + eor v19.16b, v19.16b, v26.16b //GHASH final-6 block - low + + eor v27.8b, v27.8b, v8.8b //GHASH final-6 block - mid + + pmull v27.1q, v27.1d, v24.1d //GHASH final-6 block - mid +.inst 0xce027529 //eor3 v9.16b, v9.16b, v2.16b, v29.16b //AES final-5 block - result + + movi v16.8b, #0 //suppress further partial tag feed in + + eor v18.16b, v18.16b, v27.16b //GHASH final-6 block - mid + eor v17.16b, v17.16b, v28.16b //GHASH final-6 block - high +.L256_enc_blocks_more_than_5: //blocks left > 5 + + st1 { v9.16b}, [x2], #16 //AES final-5 block - store result + + rev64 v8.16b, v9.16b //GHASH final-5 block + + eor v8.16b, v8.16b, v16.16b //feed in partial tag + + ins v27.d[0], v8.d[1] //GHASH final-5 block - mid + + pmull2 v28.1q, v8.2d, v22.2d //GHASH final-5 block - high + + eor v17.16b, v17.16b, v28.16b //GHASH final-5 block - high + eor v27.8b, v27.8b, v8.8b //GHASH final-5 block - mid + + ins v27.d[1], v27.d[0] //GHASH final-5 block - mid + + ldr q9, [x0], #16 //AES final-4 block - load plaintext + pmull v26.1q, 
v8.1d, v22.1d //GHASH final-5 block - low + + pmull2 v27.1q, v27.2d, v21.2d //GHASH final-5 block - mid + movi v16.8b, #0 //suppress further partial tag feed in + eor v19.16b, v19.16b, v26.16b //GHASH final-5 block - low + + eor v18.16b, v18.16b, v27.16b //GHASH final-5 block - mid +.inst 0xce037529 //eor3 v9.16b, v9.16b, v3.16b, v29.16b //AES final-4 block - result +.L256_enc_blocks_more_than_4: //blocks left > 4 + + st1 { v9.16b}, [x2], #16 //AES final-4 block - store result + + rev64 v8.16b, v9.16b //GHASH final-4 block + + ldr q9, [x0], #16 //AES final-3 block - load plaintext + + eor v8.16b, v8.16b, v16.16b //feed in partial tag + + ins v27.d[0], v8.d[1] //GHASH final-4 block - mid + pmull2 v28.1q, v8.2d, v20.2d //GHASH final-4 block - high + +.inst 0xce047529 //eor3 v9.16b, v9.16b, v4.16b, v29.16b //AES final-3 block - result + pmull v26.1q, v8.1d, v20.1d //GHASH final-4 block - low + + eor v27.8b, v27.8b, v8.8b //GHASH final-4 block - mid + eor v19.16b, v19.16b, v26.16b //GHASH final-4 block - low + + pmull v27.1q, v27.1d, v21.1d //GHASH final-4 block - mid + + movi v16.8b, #0 //suppress further partial tag feed in + + eor v18.16b, v18.16b, v27.16b //GHASH final-4 block - mid + eor v17.16b, v17.16b, v28.16b //GHASH final-4 block - high +.L256_enc_blocks_more_than_3: //blocks left > 3 + + st1 { v9.16b}, [x2], #16 //AES final-3 block - store result + + ldr q25, [x3, #112] //load h4l | h4h + ext v25.16b, v25.16b, v25.16b, #8 + rev64 v8.16b, v9.16b //GHASH final-3 block + + eor v8.16b, v8.16b, v16.16b //feed in partial tag + + ins v27.d[0], v8.d[1] //GHASH final-3 block - mid + pmull2 v28.1q, v8.2d, v25.2d //GHASH final-3 block - high + + eor v17.16b, v17.16b, v28.16b //GHASH final-3 block - high + eor v27.8b, v27.8b, v8.8b //GHASH final-3 block - mid + ldr q24, [x3, #96] //load h4k | h3k + + ins v27.d[1], v27.d[0] //GHASH final-3 block - mid + ldr q9, [x0], #16 //AES final-2 block - load plaintext + + pmull2 v27.1q, v27.2d, v24.2d //GHASH final-3 block - mid + pmull v26.1q, v8.1d, v25.1d //GHASH final-3 block - low + +.inst 0xce057529 //eor3 v9.16b, v9.16b, v5.16b, v29.16b //AES final-2 block - result + movi v16.8b, #0 //suppress further partial tag feed in + + eor v18.16b, v18.16b, v27.16b //GHASH final-3 block - mid + eor v19.16b, v19.16b, v26.16b //GHASH final-3 block - low +.L256_enc_blocks_more_than_2: //blocks left > 2 + + ldr q23, [x3, #80] //load h3l | h3h + ext v23.16b, v23.16b, v23.16b, #8 + + st1 { v9.16b}, [x2], #16 //AES final-2 block - store result + + rev64 v8.16b, v9.16b //GHASH final-2 block + ldr q9, [x0], #16 //AES final-1 block - load plaintext + + eor v8.16b, v8.16b, v16.16b //feed in partial tag + + ins v27.d[0], v8.d[1] //GHASH final-2 block - mid + + movi v16.8b, #0 //suppress further partial tag feed in + + pmull2 v28.1q, v8.2d, v23.2d //GHASH final-2 block - high +.inst 0xce067529 //eor3 v9.16b, v9.16b, v6.16b, v29.16b //AES final-1 block - result + + eor v27.8b, v27.8b, v8.8b //GHASH final-2 block - mid + + eor v17.16b, v17.16b, v28.16b //GHASH final-2 block - high + + pmull v27.1q, v27.1d, v24.1d //GHASH final-2 block - mid + pmull v26.1q, v8.1d, v23.1d //GHASH final-2 block - low + + eor v18.16b, v18.16b, v27.16b //GHASH final-2 block - mid + eor v19.16b, v19.16b, v26.16b //GHASH final-2 block - low +.L256_enc_blocks_more_than_1: //blocks left > 1 + + st1 { v9.16b}, [x2], #16 //AES final-1 block - store result + + ldr q22, [x3, #64] //load h2l | h2h + ext v22.16b, v22.16b, v22.16b, #8 + rev64 v8.16b, v9.16b //GHASH final-1 block + ldr q9, [x0], #16 //AES 
final block - load plaintext + + eor v8.16b, v8.16b, v16.16b //feed in partial tag + movi v16.8b, #0 //suppress further partial tag feed in + + ins v27.d[0], v8.d[1] //GHASH final-1 block - mid + pmull2 v28.1q, v8.2d, v22.2d //GHASH final-1 block - high + +.inst 0xce077529 //eor3 v9.16b, v9.16b, v7.16b, v29.16b //AES final block - result + eor v17.16b, v17.16b, v28.16b //GHASH final-1 block - high + + pmull v26.1q, v8.1d, v22.1d //GHASH final-1 block - low + eor v27.8b, v27.8b, v8.8b //GHASH final-1 block - mid + + ldr q21, [x3, #48] //load h2k | h1k + + eor v19.16b, v19.16b, v26.16b //GHASH final-1 block - low + ins v27.d[1], v27.d[0] //GHASH final-1 block - mid + + pmull2 v27.1q, v27.2d, v21.2d //GHASH final-1 block - mid + + eor v18.16b, v18.16b, v27.16b //GHASH final-1 block - mid +.L256_enc_blocks_less_than_1: //blocks left <= 1 + + and x1, x1, #127 //bit_length %= 128 + + sub x1, x1, #128 //bit_length -= 128 + + neg x1, x1 //bit_length = 128 - #bits in input (in range [1,128]) + + mvn x6, xzr //temp0_x = 0xffffffffffffffff + and x1, x1, #127 //bit_length %= 128 + + lsr x6, x6, x1 //temp0_x is mask for top 64b of last block + cmp x1, #64 + mvn x7, xzr //temp1_x = 0xffffffffffffffff + + csel x14, x6, xzr, lt + csel x13, x7, x6, lt + + mov v0.d[0], x13 //ctr0b is mask for last block + ldr q20, [x3, #32] //load h1l | h1h + ext v20.16b, v20.16b, v20.16b, #8 + + ld1 { v26.16b}, [x2] //load existing bytes where the possibly partial last block is to be stored + mov v0.d[1], x14 + + and v9.16b, v9.16b, v0.16b //possibly partial last block has zeroes in highest bits + + rev64 v8.16b, v9.16b //GHASH final block + + rev32 v30.16b, v30.16b + bif v9.16b, v26.16b, v0.16b //insert existing bytes in top end of result before storing + str q30, [x16] //store the updated counter + + eor v8.16b, v8.16b, v16.16b //feed in partial tag + st1 { v9.16b}, [x2] //store all 16B + + ins v16.d[0], v8.d[1] //GHASH final block - mid + pmull2 v28.1q, v8.2d, v20.2d //GHASH final block - high + pmull v26.1q, v8.1d, v20.1d //GHASH final block - low + + eor v17.16b, v17.16b, v28.16b //GHASH final block - high + eor v19.16b, v19.16b, v26.16b //GHASH final block - low + + eor v16.8b, v16.8b, v8.8b //GHASH final block - mid + + pmull v16.1q, v16.1d, v21.1d //GHASH final block - mid + + eor v18.16b, v18.16b, v16.16b //GHASH final block - mid + ldr d16, [x10] //MODULO - load modulo constant + + ext v21.16b, v17.16b, v17.16b, #8 //MODULO - other top alignment + +.inst 0xce114e52 //eor3 v18.16b, v18.16b, v17.16b, v19.16b //MODULO - karatsuba tidy up + pmull v29.1q, v17.1d, v16.1d //MODULO - top 64b align with mid + +.inst 0xce1d5652 //eor3 v18.16b, v18.16b, v29.16b, v21.16b //MODULO - fold into mid + + pmull v17.1q, v18.1d, v16.1d //MODULO - mid 64b align with low + ext v21.16b, v18.16b, v18.16b, #8 //MODULO - other mid alignment + +.inst 0xce115673 //eor3 v19.16b, v19.16b, v17.16b, v21.16b //MODULO - fold into low + ext v19.16b, v19.16b, v19.16b, #8 + rev64 v19.16b, v19.16b + st1 { v19.16b }, [x3] + mov x0, x9 //return sizes + + ldp d10, d11, [sp, #16] + ldp d12, d13, [sp, #32] + ldp d14, d15, [sp, #48] + ldp d8, d9, [sp], #80 + ret + +.L256_enc_ret: + mov w0, #0x0 + ret +.size unroll8_eor3_aes_gcm_enc_256_kernel,.-unroll8_eor3_aes_gcm_enc_256_kernel +.globl unroll8_eor3_aes_gcm_dec_256_kernel +.type unroll8_eor3_aes_gcm_dec_256_kernel,%function +.align 4 +unroll8_eor3_aes_gcm_dec_256_kernel: + AARCH64_VALID_CALL_TARGET + cbz x1, .L256_dec_ret + stp d8, d9, [sp, #-80]! 
+ lsr x9, x1, #3 + mov x16, x4 + mov x8, x5 + stp d10, d11, [sp, #16] + stp d12, d13, [sp, #32] + stp d14, d15, [sp, #48] + mov x5, #0xc200000000000000 + stp x5, xzr, [sp, #64] + add x10, sp, #64 + + ld1 { v0.16b}, [x16] //CTR block 0 + + mov x15, #0x100000000 //set up counter increment + movi v31.16b, #0x0 + mov v31.d[1], x15 + mov x5, x9 + + sub x5, x5, #1 //byte_len - 1 + + rev32 v30.16b, v0.16b //set up reversed counter + + add v30.4s, v30.4s, v31.4s //CTR block 0 + + rev32 v1.16b, v30.16b //CTR block 1 + add v30.4s, v30.4s, v31.4s //CTR block 1 + + rev32 v2.16b, v30.16b //CTR block 2 + add v30.4s, v30.4s, v31.4s //CTR block 2 + ldp q26, q27, [x8, #0] //load rk0, rk1 + + rev32 v3.16b, v30.16b //CTR block 3 + add v30.4s, v30.4s, v31.4s //CTR block 3 + + rev32 v4.16b, v30.16b //CTR block 4 + add v30.4s, v30.4s, v31.4s //CTR block 4 + + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 0 - round 0 + + rev32 v5.16b, v30.16b //CTR block 5 + add v30.4s, v30.4s, v31.4s //CTR block 5 + + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 1 - round 0 + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 2 - round 0 + + rev32 v6.16b, v30.16b //CTR block 6 + add v30.4s, v30.4s, v31.4s //CTR block 6 + + rev32 v7.16b, v30.16b //CTR block 7 + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 4 - round 0 + + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 6 - round 0 + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 5 - round 0 + + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 3 - round 0 + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 7 - round 0 + ldp q28, q26, [x8, #32] //load rk2, rk3 + + aese v6.16b, v27.16b + aesmc v6.16b, v6.16b //AES block 6 - round 1 + aese v4.16b, v27.16b + aesmc v4.16b, v4.16b //AES block 4 - round 1 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 0 - round 1 + + aese v5.16b, v27.16b + aesmc v5.16b, v5.16b //AES block 5 - round 1 + aese v7.16b, v27.16b + aesmc v7.16b, v7.16b //AES block 7 - round 1 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 1 - round 1 + + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 2 - round 1 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 3 - round 1 + + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 3 - round 2 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 2 - round 2 + aese v6.16b, v28.16b + aesmc v6.16b, v6.16b //AES block 6 - round 2 + + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 1 - round 2 + aese v7.16b, v28.16b + aesmc v7.16b, v7.16b //AES block 7 - round 2 + aese v5.16b, v28.16b + aesmc v5.16b, v5.16b //AES block 5 - round 2 + + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 0 - round 2 + aese v4.16b, v28.16b + aesmc v4.16b, v4.16b //AES block 4 - round 2 + ldp q27, q28, [x8, #64] //load rk4, rk5 + + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 1 - round 3 + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 2 - round 3 + + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 3 - round 3 + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 4 - round 3 + + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 5 - round 3 + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 7 - round 3 + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 0 - round 3 + + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 6 - round 3 + + aese v7.16b, v27.16b + aesmc v7.16b, v7.16b //AES block 7 - round 4 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 3 - round 4 + + aese 
v6.16b, v27.16b + aesmc v6.16b, v6.16b //AES block 6 - round 4 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 2 - round 4 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 0 - round 4 + + aese v4.16b, v27.16b + aesmc v4.16b, v4.16b //AES block 4 - round 4 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 1 - round 4 + aese v5.16b, v27.16b + aesmc v5.16b, v5.16b //AES block 5 - round 4 + + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 0 - round 5 + aese v6.16b, v28.16b + aesmc v6.16b, v6.16b //AES block 6 - round 5 + + ldp q26, q27, [x8, #96] //load rk6, rk7 + aese v4.16b, v28.16b + aesmc v4.16b, v4.16b //AES block 4 - round 5 + aese v7.16b, v28.16b + aesmc v7.16b, v7.16b //AES block 7 - round 5 + + aese v5.16b, v28.16b + aesmc v5.16b, v5.16b //AES block 5 - round 5 + + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 2 - round 5 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 3 - round 5 + + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 1 - round 5 + + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 4 - round 6 + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 3 - round 6 + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 7 - round 6 + + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 6 - round 6 + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 0 - round 6 + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 5 - round 6 + + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 2 - round 6 + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 1 - round 6 + ldp q28, q26, [x8, #128] //load rk8, rk9 + + aese v5.16b, v27.16b + aesmc v5.16b, v5.16b //AES block 5 - round 7 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 0 - round 7 + + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 3 - round 7 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 2 - round 7 + aese v7.16b, v27.16b + aesmc v7.16b, v7.16b //AES block 7 - round 7 + + aese v4.16b, v27.16b + aesmc v4.16b, v4.16b //AES block 4 - round 7 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 1 - round 7 + aese v6.16b, v27.16b + aesmc v6.16b, v6.16b //AES block 6 - round 7 + + and x5, x5, #0xffffffffffffff80 //number of bytes to be processed in main loop (at least 1 byte must be handled by tail) + aese v7.16b, v28.16b + aesmc v7.16b, v7.16b //AES block 7 - round 8 + aese v5.16b, v28.16b + aesmc v5.16b, v5.16b //AES block 5 - round 8 + + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 0 - round 8 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 1 - round 8 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 2 - round 8 + + aese v4.16b, v28.16b + aesmc v4.16b, v4.16b //AES block 4 - round 8 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 3 - round 8 + aese v6.16b, v28.16b + aesmc v6.16b, v6.16b //AES block 6 - round 8 + + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 2 - round 9 + + ld1 { v19.16b}, [x3] + ext v19.16b, v19.16b, v19.16b, #8 + rev64 v19.16b, v19.16b + ldp q27, q28, [x8, #160] //load rk10, rk11 + add x4, x0, x1, lsr #3 //end_input_ptr + add x5, x5, x0 + + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 3 - round 9 + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 6 - round 9 + + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 4 - round 9 + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 5 - round 9 + + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 7 - round 9 + + aese v0.16b, v26.16b + aesmc v0.16b, 
v0.16b //AES block 0 - round 9 + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 1 - round 9 + + aese v4.16b, v27.16b + aesmc v4.16b, v4.16b //AES block 4 - round 10 + aese v7.16b, v27.16b + aesmc v7.16b, v7.16b //AES block 7 - round 10 + aese v5.16b, v27.16b + aesmc v5.16b, v5.16b //AES block 5 - round 10 + + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 1 - round 10 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 2 - round 10 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 0 - round 10 + + aese v6.16b, v27.16b + aesmc v6.16b, v6.16b //AES block 6 - round 10 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 3 - round 10 + ldp q26, q27, [x8, #192] //load rk12, rk13 + + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 0 - round 11 + add v30.4s, v30.4s, v31.4s //CTR block 7 + + aese v7.16b, v28.16b + aesmc v7.16b, v7.16b //AES block 7 - round 11 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 3 - round 11 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 1 - round 11 + + aese v5.16b, v28.16b + aesmc v5.16b, v5.16b //AES block 5 - round 11 + aese v4.16b, v28.16b + aesmc v4.16b, v4.16b //AES block 4 - round 11 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 2 - round 11 + + aese v6.16b, v28.16b + aesmc v6.16b, v6.16b //AES block 6 - round 11 + ldr q28, [x8, #224] //load rk14 + + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 1 - round 12 + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 4 - round 12 + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 5 - round 12 + + cmp x0, x5 //check if we have <= 8 blocks + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 3 - round 12 + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 2 - round 12 + + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 6 - round 12 + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 0 - round 12 + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 7 - round 12 + + aese v5.16b, v27.16b //AES block 5 - round 13 + aese v1.16b, v27.16b //AES block 1 - round 13 + aese v2.16b, v27.16b //AES block 2 - round 13 + + aese v0.16b, v27.16b //AES block 0 - round 13 + aese v4.16b, v27.16b //AES block 4 - round 13 + aese v6.16b, v27.16b //AES block 6 - round 13 + + aese v3.16b, v27.16b //AES block 3 - round 13 + aese v7.16b, v27.16b //AES block 7 - round 13 + b.ge .L256_dec_tail //handle tail + + ldp q8, q9, [x0], #32 //AES block 0, 1 - load ciphertext + + ldp q10, q11, [x0], #32 //AES block 2, 3 - load ciphertext + + ldp q12, q13, [x0], #32 //AES block 4, 5 - load ciphertext + + ldp q14, q15, [x0], #32 //AES block 6, 7 - load ciphertext + cmp x0, x5 //check if we have <= 8 blocks + +.inst 0xce017121 //eor3 v1.16b, v9.16b, v1.16b, v28.16b //AES block 1 - result +.inst 0xce007100 //eor3 v0.16b, v8.16b, v0.16b, v28.16b //AES block 0 - result + stp q0, q1, [x2], #32 //AES block 0, 1 - store result + + rev32 v0.16b, v30.16b //CTR block 8 + add v30.4s, v30.4s, v31.4s //CTR block 8 +.inst 0xce037163 //eor3 v3.16b, v11.16b, v3.16b, v28.16b //AES block 3 - result + +.inst 0xce0571a5 //eor3 v5.16b, v13.16b, v5.16b, v28.16b //AES block 5 - result + +.inst 0xce047184 //eor3 v4.16b, v12.16b, v4.16b, v28.16b //AES block 4 - result + rev32 v1.16b, v30.16b //CTR block 9 + add v30.4s, v30.4s, v31.4s //CTR block 9 + +.inst 0xce027142 //eor3 v2.16b, v10.16b, v2.16b, v28.16b //AES block 2 - result + stp q2, q3, [x2], #32 //AES block 2, 3 - store result + + rev32 v2.16b, v30.16b //CTR block 10 + add v30.4s, v30.4s, v31.4s //CTR 
block 10 + +.inst 0xce0671c6 //eor3 v6.16b, v14.16b, v6.16b, v28.16b //AES block 6 - result + + rev32 v3.16b, v30.16b //CTR block 11 + add v30.4s, v30.4s, v31.4s //CTR block 11 + stp q4, q5, [x2], #32 //AES block 4, 5 - store result + +.inst 0xce0771e7 //eor3 v7.16b, v15.16b, v7.16b, v28.16b //AES block 7 - result + stp q6, q7, [x2], #32 //AES block 6, 7 - store result + + rev32 v4.16b, v30.16b //CTR block 12 + add v30.4s, v30.4s, v31.4s //CTR block 12 + b.ge .L256_dec_prepretail //do prepretail + +.L256_dec_main_loop: //main loop start + rev32 v5.16b, v30.16b //CTR block 8k+13 + ldp q26, q27, [x8, #0] //load rk0, rk1 + add v30.4s, v30.4s, v31.4s //CTR block 8k+13 + + rev64 v9.16b, v9.16b //GHASH block 8k+1 + ldr q23, [x3, #176] //load h7l | h7h + ext v23.16b, v23.16b, v23.16b, #8 + ldr q25, [x3, #208] //load h8l | h8h + ext v25.16b, v25.16b, v25.16b, #8 + + rev32 v6.16b, v30.16b //CTR block 8k+14 + add v30.4s, v30.4s, v31.4s //CTR block 8k+14 + rev64 v8.16b, v8.16b //GHASH block 8k + + ext v19.16b, v19.16b, v19.16b, #8 //PRE 0 + rev64 v12.16b, v12.16b //GHASH block 8k+4 + rev64 v11.16b, v11.16b //GHASH block 8k+3 + + rev32 v7.16b, v30.16b //CTR block 8k+15 + rev64 v15.16b, v15.16b //GHASH block 8k+7 + + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 0 + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 0 + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 0 + + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 0 + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 0 + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 0 + + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 0 + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 0 + ldp q28, q26, [x8, #32] //load rk2, rk3 + + eor v8.16b, v8.16b, v19.16b //PRE 1 + ldr q20, [x3, #128] //load h5l | h5h + ext v20.16b, v20.16b, v20.16b, #8 + ldr q22, [x3, #160] //load h6l | h6h + ext v22.16b, v22.16b, v22.16b, #8 + aese v6.16b, v27.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 1 + + aese v4.16b, v27.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 1 + rev64 v10.16b, v10.16b //GHASH block 8k+2 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 1 + + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 1 + aese v5.16b, v27.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 1 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 1 + + trn1 v18.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid + aese v7.16b, v27.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 1 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 1 + + aese v4.16b, v28.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 2 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 2 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 2 + + aese v6.16b, v28.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 2 + aese v7.16b, v28.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 2 + pmull v19.1q, v8.1d, v25.1d //GHASH block 8k - low + + aese v5.16b, v28.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 2 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 2 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 2 + + ldp q27, q28, [x8, #64] //load rk4, rk5 + pmull2 v29.1q, v10.2d, v22.2d //GHASH block 8k+2 - high + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 
8k+11 - round 3 + + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 3 + pmull2 v16.1q, v9.2d, v23.2d //GHASH block 8k+1 - high + pmull v23.1q, v9.1d, v23.1d //GHASH block 8k+1 - low + + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 3 + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 3 + pmull2 v17.1q, v8.2d, v25.2d //GHASH block 8k - high + + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 3 + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 3 + trn2 v8.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid + + pmull2 v9.1q, v11.2d, v20.2d //GHASH block 8k+3 - high + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 3 + eor v17.16b, v17.16b, v16.16b //GHASH block 8k+1 - high + + aese v5.16b, v27.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 4 + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 3 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 4 + + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 4 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 4 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 4 + + aese v6.16b, v27.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 4 + aese v7.16b, v27.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 4 + aese v4.16b, v27.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 4 + + ldr q21, [x3, #144] //load h6k | h5k + ldr q24, [x3, #192] //load h8k | h7k + eor v8.16b, v8.16b, v18.16b //GHASH block 8k, 8k+1 - mid + pmull v22.1q, v10.1d, v22.1d //GHASH block 8k+2 - low + + ldp q26, q27, [x8, #96] //load rk6, rk7 + aese v5.16b, v28.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 5 + eor v19.16b, v19.16b, v23.16b //GHASH block 8k+1 - low + + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 5 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 5 + aese v7.16b, v28.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 5 + + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 5 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 5 + aese v6.16b, v28.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 5 + +.inst 0xce1d2631 //eor3 v17.16b, v17.16b, v29.16b, v9.16b //GHASH block 8k+2, 8k+3 - high + trn1 v29.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid + rev64 v13.16b, v13.16b //GHASH block 8k+5 + + pmull2 v18.1q, v8.2d, v24.2d //GHASH block 8k - mid + pmull v24.1q, v8.1d, v24.1d //GHASH block 8k+1 - mid + trn2 v10.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid + + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 6 + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 6 + aese v4.16b, v28.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 5 + + trn1 v16.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 6 + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 6 + + eor v10.16b, v10.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid + pmull v20.1q, v11.1d, v20.1d //GHASH block 8k+3 - low + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 6 + + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 6 + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 6 + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 6 + + pmull2 v29.1q, v10.2d, v21.2d //GHASH 
block 8k+2 - mid + pmull v21.1q, v10.1d, v21.1d //GHASH block 8k+3 - mid +.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+2, 8k+3 - low + + ldr q23, [x3, #80] //load h3l | h3h + ext v23.16b, v23.16b, v23.16b, #8 + ldr q25, [x3, #112] //load h4l | h4h + ext v25.16b, v25.16b, v25.16b, #8 + rev64 v14.16b, v14.16b //GHASH block 8k+6 + eor v18.16b, v18.16b, v24.16b //GHASH block 8k+1 - mid + + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 7 + aese v5.16b, v27.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 7 + ldp q28, q26, [x8, #128] //load rk8, rk9 + + ldr q20, [x3, #32] //load h1l | h1h + ext v20.16b, v20.16b, v20.16b, #8 + ldr q22, [x3, #64] //load h2l | h2h + ext v22.16b, v22.16b, v22.16b, #8 +.inst 0xce157652 //eor3 v18.16b, v18.16b, v21.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid + aese v7.16b, v27.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 7 + + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 7 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 7 + aese v6.16b, v27.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 7 + + ldr q21, [x3, #48] //load h2k | h1k + ldr q24, [x3, #96] //load h4k | h3k + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 7 + aese v4.16b, v27.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 7 + + pmull2 v8.1q, v12.2d, v25.2d //GHASH block 8k+4 - high + pmull v25.1q, v12.1d, v25.1d //GHASH block 8k+4 - low + trn2 v12.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid + + aese v5.16b, v28.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 8 + pmull2 v10.1q, v13.2d, v23.2d //GHASH block 8k+5 - high + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 8 + + aese v6.16b, v28.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 8 + pmull v23.1q, v13.1d, v23.1d //GHASH block 8k+5 - low + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 8 + + aese v4.16b, v28.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 8 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 8 + pmull2 v11.1q, v14.2d, v22.2d //GHASH block 8k+6 - high + + trn1 v13.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 8 + aese v7.16b, v28.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 8 + + ldp q27, q28, [x8, #160] //load rk10, rk11 + pmull v22.1q, v14.1d, v22.1d //GHASH block 8k+6 - low + trn2 v14.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid + + add v30.4s, v30.4s, v31.4s //CTR block 8k+15 +.inst 0xce082a31 //eor3 v17.16b, v17.16b, v8.16b, v10.16b //GHASH block 8k+4, 8k+5 - high + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 9 + + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 9 + eor v14.16b, v14.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 9 + + ldp q8, q9, [x0], #32 //AES block 8k+8, 8k+9 - load ciphertext + eor v12.16b, v12.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 9 + + pmull2 v13.1q, v14.2d, v21.2d //GHASH block 8k+6 - mid + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 9 + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 9 + + pmull2 v16.1q, v12.2d, v24.2d //GHASH block 8k+4 - mid + pmull v24.1q, v12.1d, v24.1d //GHASH block 8k+5 - mid + pmull2 v12.1q, v15.2d, v20.2d //GHASH block 8k+7 - high + 
+ pmull v20.1q, v15.1d, v20.1d //GHASH block 8k+7 - low + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 10 + aese v6.16b, v27.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 10 + + pmull v21.1q, v14.1d, v21.1d //GHASH block 8k+7 - mid + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 9 +.inst 0xce195e73 //eor3 v19.16b, v19.16b, v25.16b, v23.16b //GHASH block 8k+4, 8k+5 - low + + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 9 +.inst 0xce184252 //eor3 v18.16b, v18.16b, v24.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid +.inst 0xce0b3231 //eor3 v17.16b, v17.16b, v11.16b, v12.16b //GHASH block 8k+6, 8k+7 - high + + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 10 + aese v5.16b, v27.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 10 + aese v7.16b, v27.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 10 + + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 10 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 10 + aese v4.16b, v27.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 10 + +.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+6, 8k+7 - low + rev32 v20.16b, v30.16b //CTR block 8k+16 + ldr d16, [x10] //MODULO - load modulo constant + + add v30.4s, v30.4s, v31.4s //CTR block 8k+16 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 11 + ldp q26, q27, [x8, #192] //load rk12, rk13 + + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 11 + aese v6.16b, v28.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 11 + +.inst 0xce153652 //eor3 v18.16b, v18.16b, v21.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid + rev32 v22.16b, v30.16b //CTR block 8k+17 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 11 + + ldp q10, q11, [x0], #32 //AES block 8k+10, 8k+11 - load ciphertext + aese v7.16b, v28.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 11 + ext v21.16b, v17.16b, v17.16b, #8 //MODULO - other top alignment + + aese v5.16b, v28.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 11 + add v30.4s, v30.4s, v31.4s //CTR block 8k+17 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 11 + + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 12 + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 12 + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 12 + + rev32 v23.16b, v30.16b //CTR block 8k+18 + add v30.4s, v30.4s, v31.4s //CTR block 8k+18 + pmull v29.1q, v17.1d, v16.1d //MODULO - top 64b align with mid + +.inst 0xce114e52 //eor3 v18.16b, v18.16b, v17.16b, v19.16b //MODULO - karatsuba tidy up + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 12 + aese v4.16b, v28.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 11 + + ldr q28, [x8, #224] //load rk14 + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 12 + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 12 + +.inst 0xce1d5652 //eor3 v18.16b, v18.16b, v29.16b, v21.16b //MODULO - fold into mid + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 12 + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 12 + + ldp q12, q13, [x0], #32 //AES block 8k+12, 8k+13 - load ciphertext + aese v1.16b, v27.16b //AES block 8k+9 - round 13 + aese v2.16b, v27.16b //AES block 8k+10 - round 13 + + ldp q14, q15, [x0], #32 //AES block 8k+14, 8k+15 - load 
ciphertext + aese v0.16b, v27.16b //AES block 8k+8 - round 13 + aese v5.16b, v27.16b //AES block 8k+13 - round 13 + + rev32 v25.16b, v30.16b //CTR block 8k+19 +.inst 0xce027142 //eor3 v2.16b, v10.16b, v2.16b, v28.16b //AES block 8k+10 - result +.inst 0xce017121 //eor3 v1.16b, v9.16b, v1.16b, v28.16b //AES block 8k+9 - result + + ext v21.16b, v18.16b, v18.16b, #8 //MODULO - other mid alignment + aese v7.16b, v27.16b //AES block 8k+15 - round 13 + + add v30.4s, v30.4s, v31.4s //CTR block 8k+19 + pmull v17.1q, v18.1d, v16.1d //MODULO - mid 64b align with low + aese v4.16b, v27.16b //AES block 8k+12 - round 13 + +.inst 0xce0571a5 //eor3 v5.16b, v13.16b, v5.16b, v28.16b //AES block 8k+13 - result +.inst 0xce007100 //eor3 v0.16b, v8.16b, v0.16b, v28.16b //AES block 8k+8 - result + aese v3.16b, v27.16b //AES block 8k+11 - round 13 + + stp q0, q1, [x2], #32 //AES block 8k+8, 8k+9 - store result + mov v0.16b, v20.16b //CTR block 8k+16 +.inst 0xce047184 //eor3 v4.16b, v12.16b, v4.16b, v28.16b //AES block 8k+12 - result + +.inst 0xce154673 //eor3 v19.16b, v19.16b, v21.16b, v17.16b //MODULO - fold into low +.inst 0xce037163 //eor3 v3.16b, v11.16b, v3.16b, v28.16b //AES block 8k+11 - result + stp q2, q3, [x2], #32 //AES block 8k+10, 8k+11 - store result + + mov v3.16b, v25.16b //CTR block 8k+19 + mov v2.16b, v23.16b //CTR block 8k+18 + aese v6.16b, v27.16b //AES block 8k+14 - round 13 + + mov v1.16b, v22.16b //CTR block 8k+17 + stp q4, q5, [x2], #32 //AES block 8k+12, 8k+13 - store result +.inst 0xce0771e7 //eor3 v7.16b, v15.16b, v7.16b, v28.16b //AES block 8k+15 - result + +.inst 0xce0671c6 //eor3 v6.16b, v14.16b, v6.16b, v28.16b //AES block 8k+14 - result + rev32 v4.16b, v30.16b //CTR block 8k+20 + add v30.4s, v30.4s, v31.4s //CTR block 8k+20 + + cmp x0, x5 //.LOOP CONTROL + stp q6, q7, [x2], #32 //AES block 8k+14, 8k+15 - store result + b.lt .L256_dec_main_loop + +.L256_dec_prepretail: //PREPRETAIL + ldp q26, q27, [x8, #0] //load rk0, rk1 + rev32 v5.16b, v30.16b //CTR block 8k+13 + add v30.4s, v30.4s, v31.4s //CTR block 8k+13 + + rev64 v12.16b, v12.16b //GHASH block 8k+4 + ldr q21, [x3, #144] //load h6k | h5k + ldr q24, [x3, #192] //load h8k | h7k + + rev32 v6.16b, v30.16b //CTR block 8k+14 + rev64 v8.16b, v8.16b //GHASH block 8k + add v30.4s, v30.4s, v31.4s //CTR block 8k+14 + + ext v19.16b, v19.16b, v19.16b, #8 //PRE 0 + ldr q23, [x3, #176] //load h7l | h7h + ext v23.16b, v23.16b, v23.16b, #8 + ldr q25, [x3, #208] //load h8l | h8h + ext v25.16b, v25.16b, v25.16b, #8 + rev64 v9.16b, v9.16b //GHASH block 8k+1 + + rev32 v7.16b, v30.16b //CTR block 8k+15 + rev64 v10.16b, v10.16b //GHASH block 8k+2 + ldr q20, [x3, #128] //load h5l | h5h + ext v20.16b, v20.16b, v20.16b, #8 + ldr q22, [x3, #160] //load h6l | h6h + ext v22.16b, v22.16b, v22.16b, #8 + + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 0 + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 0 + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 0 + + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 0 + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 0 + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 0 + + aese v4.16b, v27.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 1 + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 0 + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 0 + + ldp q28, q26, [x8, #32] //load rk2, rk3 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 
8k+8 - round 1 + eor v8.16b, v8.16b, v19.16b //PRE 1 + + aese v7.16b, v27.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 1 + aese v6.16b, v27.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 1 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 1 + + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 1 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 1 + aese v5.16b, v27.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 1 + + pmull2 v16.1q, v9.2d, v23.2d //GHASH block 8k+1 - high + trn1 v18.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid + pmull v19.1q, v8.1d, v25.1d //GHASH block 8k - low + + rev64 v11.16b, v11.16b //GHASH block 8k+3 + pmull v23.1q, v9.1d, v23.1d //GHASH block 8k+1 - low + + aese v5.16b, v28.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 2 + aese v7.16b, v28.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 2 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 2 + + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 2 + aese v6.16b, v28.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 2 + pmull2 v17.1q, v8.2d, v25.2d //GHASH block 8k - high + + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 2 + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 3 + + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 3 + rev64 v14.16b, v14.16b //GHASH block 8k+6 + + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 3 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 2 + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 3 + + pmull2 v29.1q, v10.2d, v22.2d //GHASH block 8k+2 - high + trn2 v8.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid + aese v4.16b, v28.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 2 + + ldp q27, q28, [x8, #64] //load rk4, rk5 + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 3 + pmull2 v9.1q, v11.2d, v20.2d //GHASH block 8k+3 - high + + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 3 + eor v17.16b, v17.16b, v16.16b //GHASH block 8k+1 - high + eor v8.16b, v8.16b, v18.16b //GHASH block 8k, 8k+1 - mid + + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 3 + pmull v22.1q, v10.1d, v22.1d //GHASH block 8k+2 - low + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 3 + +.inst 0xce1d2631 //eor3 v17.16b, v17.16b, v29.16b, v9.16b //GHASH block 8k+2, 8k+3 - high + trn1 v29.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid + trn2 v10.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid + + pmull2 v18.1q, v8.2d, v24.2d //GHASH block 8k - mid + pmull v20.1q, v11.1d, v20.1d //GHASH block 8k+3 - low + eor v19.16b, v19.16b, v23.16b //GHASH block 8k+1 - low + + pmull v24.1q, v8.1d, v24.1d //GHASH block 8k+1 - mid + aese v5.16b, v27.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 4 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 4 + +.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+2, 8k+3 - low + ldr q20, [x3, #32] //load h1l | h1h + ext v20.16b, v20.16b, v20.16b, #8 + ldr q22, [x3, #64] //load h2l | h2h + ext v22.16b, v22.16b, v22.16b, #8 + aese v7.16b, v27.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 4 + + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 4 + aese v6.16b, v27.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 4 + eor v18.16b, v18.16b, v24.16b //GHASH block 
8k+1 - mid + + eor v10.16b, v10.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid + aese v7.16b, v28.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 5 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 4 + + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 5 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 4 + aese v4.16b, v27.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 4 + + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 5 + pmull2 v29.1q, v10.2d, v21.2d //GHASH block 8k+2 - mid + aese v6.16b, v28.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 5 + + aese v4.16b, v28.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 5 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 5 + pmull v21.1q, v10.1d, v21.1d //GHASH block 8k+3 - mid + + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 5 + aese v5.16b, v28.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 5 + ldp q26, q27, [x8, #96] //load rk6, rk7 + + ldr q23, [x3, #80] //load h3l | h3h + ext v23.16b, v23.16b, v23.16b, #8 + ldr q25, [x3, #112] //load h4l | h4h + ext v25.16b, v25.16b, v25.16b, #8 + rev64 v15.16b, v15.16b //GHASH block 8k+7 + rev64 v13.16b, v13.16b //GHASH block 8k+5 + +.inst 0xce157652 //eor3 v18.16b, v18.16b, v21.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid + + trn1 v16.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid + + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 6 + ldr q21, [x3, #48] //load h2k | h1k + ldr q24, [x3, #96] //load h4k | h3k + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 6 + + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 6 + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 6 + + pmull2 v8.1q, v12.2d, v25.2d //GHASH block 8k+4 - high + pmull2 v10.1q, v13.2d, v23.2d //GHASH block 8k+5 - high + pmull v25.1q, v12.1d, v25.1d //GHASH block 8k+4 - low + + trn2 v12.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid + pmull v23.1q, v13.1d, v23.1d //GHASH block 8k+5 - low + trn1 v13.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid + + aese v7.16b, v27.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 7 + pmull2 v11.1q, v14.2d, v22.2d //GHASH block 8k+6 - high + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 6 + + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 6 + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 6 + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 6 + + ldp q28, q26, [x8, #128] //load rk8, rk9 + pmull v22.1q, v14.1d, v22.1d //GHASH block 8k+6 - low + aese v5.16b, v27.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 7 + + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 7 + aese v4.16b, v27.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 7 + + aese v6.16b, v27.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 7 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 7 +.inst 0xce082a31 //eor3 v17.16b, v17.16b, v8.16b, v10.16b //GHASH block 8k+4, 8k+5 - high + + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 7 + trn2 v14.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 7 + + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 8 + aese v7.16b, v28.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 8 + aese v4.16b, v28.16b + 
aesmc v4.16b, v4.16b //AES block 8k+12 - round 8 + + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 8 + aese v5.16b, v28.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 8 + aese v6.16b, v28.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 8 + + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 8 + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 9 + eor v12.16b, v12.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid + + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 9 + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 9 + eor v14.16b, v14.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid + + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 9 + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 9 + pmull2 v16.1q, v12.2d, v24.2d //GHASH block 8k+4 - mid + + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 8 + pmull v24.1q, v12.1d, v24.1d //GHASH block 8k+5 - mid + pmull2 v12.1q, v15.2d, v20.2d //GHASH block 8k+7 - high + + pmull2 v13.1q, v14.2d, v21.2d //GHASH block 8k+6 - mid + pmull v21.1q, v14.1d, v21.1d //GHASH block 8k+7 - mid + pmull v20.1q, v15.1d, v20.1d //GHASH block 8k+7 - low + + ldp q27, q28, [x8, #160] //load rk10, rk11 +.inst 0xce195e73 //eor3 v19.16b, v19.16b, v25.16b, v23.16b //GHASH block 8k+4, 8k+5 - low +.inst 0xce184252 //eor3 v18.16b, v18.16b, v24.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid + + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 9 + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 9 + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 9 + +.inst 0xce0b3231 //eor3 v17.16b, v17.16b, v11.16b, v12.16b //GHASH block 8k+6, 8k+7 - high +.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+6, 8k+7 - low + ldr d16, [x10] //MODULO - load modulo constant + +.inst 0xce153652 //eor3 v18.16b, v18.16b, v21.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid + + aese v4.16b, v27.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 10 + aese v6.16b, v27.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 10 + aese v5.16b, v27.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 10 + + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 10 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 10 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 10 + +.inst 0xce114e52 //eor3 v18.16b, v18.16b, v17.16b, v19.16b //MODULO - karatsuba tidy up + + aese v7.16b, v27.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 10 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 10 + ldp q26, q27, [x8, #192] //load rk12, rk13 + + ext v21.16b, v17.16b, v17.16b, #8 //MODULO - other top alignment + + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 11 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 11 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 11 + + pmull v29.1q, v17.1d, v16.1d //MODULO - top 64b align with mid + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 8k+11 - round 11 + + aese v7.16b, v28.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 11 + aese v6.16b, v28.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 11 + aese v4.16b, v28.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 11 + + aese v5.16b, v28.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 11 + aese v3.16b, v26.16b + aesmc 
v3.16b, v3.16b //AES block 8k+11 - round 12 + +.inst 0xce1d5652 //eor3 v18.16b, v18.16b, v29.16b, v21.16b //MODULO - fold into mid + + aese v3.16b, v27.16b //AES block 8k+11 - round 13 + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 8k+10 - round 12 + aese v6.16b, v26.16b + aesmc v6.16b, v6.16b //AES block 8k+14 - round 12 + + pmull v17.1q, v18.1d, v16.1d //MODULO - mid 64b align with low + aese v4.16b, v26.16b + aesmc v4.16b, v4.16b //AES block 8k+12 - round 12 + aese v7.16b, v26.16b + aesmc v7.16b, v7.16b //AES block 8k+15 - round 12 + + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 8k+8 - round 12 + ldr q28, [x8, #224] //load rk14 + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 8k+9 - round 12 + + aese v4.16b, v27.16b //AES block 8k+12 - round 13 + ext v21.16b, v18.16b, v18.16b, #8 //MODULO - other mid alignment + aese v5.16b, v26.16b + aesmc v5.16b, v5.16b //AES block 8k+13 - round 12 + + aese v6.16b, v27.16b //AES block 8k+14 - round 13 + aese v2.16b, v27.16b //AES block 8k+10 - round 13 + aese v1.16b, v27.16b //AES block 8k+9 - round 13 + + aese v5.16b, v27.16b //AES block 8k+13 - round 13 +.inst 0xce154673 //eor3 v19.16b, v19.16b, v21.16b, v17.16b //MODULO - fold into low + add v30.4s, v30.4s, v31.4s //CTR block 8k+15 + + aese v7.16b, v27.16b //AES block 8k+15 - round 13 + aese v0.16b, v27.16b //AES block 8k+8 - round 13 +.L256_dec_tail: //TAIL + + ext v16.16b, v19.16b, v19.16b, #8 //prepare final partial tag + sub x5, x4, x0 //main_end_input_ptr is number of bytes left to process + cmp x5, #112 + + ldr q9, [x0], #16 //AES block 8k+8 - load ciphertext + + ldp q24, q25, [x3, #192] //load h8k | h7k + ext v25.16b, v25.16b, v25.16b, #8 + mov v29.16b, v28.16b + + ldp q20, q21, [x3, #128] //load h5l | h5h + ext v20.16b, v20.16b, v20.16b, #8 + +.inst 0xce00752c //eor3 v12.16b, v9.16b, v0.16b, v29.16b //AES block 8k+8 - result + ldp q22, q23, [x3, #160] //load h6l | h6h + ext v22.16b, v22.16b, v22.16b, #8 + ext v23.16b, v23.16b, v23.16b, #8 + b.gt .L256_dec_blocks_more_than_7 + + mov v7.16b, v6.16b + sub v30.4s, v30.4s, v31.4s + mov v6.16b, v5.16b + + mov v5.16b, v4.16b + mov v4.16b, v3.16b + movi v19.8b, #0 + + movi v17.8b, #0 + movi v18.8b, #0 + mov v3.16b, v2.16b + + cmp x5, #96 + mov v2.16b, v1.16b + b.gt .L256_dec_blocks_more_than_6 + + mov v7.16b, v6.16b + mov v6.16b, v5.16b + + mov v5.16b, v4.16b + cmp x5, #80 + sub v30.4s, v30.4s, v31.4s + + mov v4.16b, v3.16b + mov v3.16b, v1.16b + b.gt .L256_dec_blocks_more_than_5 + + cmp x5, #64 + mov v7.16b, v6.16b + sub v30.4s, v30.4s, v31.4s + + mov v6.16b, v5.16b + + mov v5.16b, v4.16b + mov v4.16b, v1.16b + b.gt .L256_dec_blocks_more_than_4 + + sub v30.4s, v30.4s, v31.4s + mov v7.16b, v6.16b + cmp x5, #48 + + mov v6.16b, v5.16b + mov v5.16b, v1.16b + b.gt .L256_dec_blocks_more_than_3 + + ldr q24, [x3, #96] //load h4k | h3k + sub v30.4s, v30.4s, v31.4s + mov v7.16b, v6.16b + + cmp x5, #32 + mov v6.16b, v1.16b + b.gt .L256_dec_blocks_more_than_2 + + sub v30.4s, v30.4s, v31.4s + + mov v7.16b, v1.16b + cmp x5, #16 + b.gt .L256_dec_blocks_more_than_1 + + sub v30.4s, v30.4s, v31.4s + ldr q21, [x3, #48] //load h2k | h1k + b .L256_dec_blocks_less_than_1 +.L256_dec_blocks_more_than_7: //blocks left > 7 + rev64 v8.16b, v9.16b //GHASH final-7 block + ldr q9, [x0], #16 //AES final-6 block - load ciphertext + st1 { v12.16b}, [x2], #16 //AES final-7 block - store result + + ins v18.d[0], v24.d[1] //GHASH final-7 block - mid + + eor v8.16b, v8.16b, v16.16b //feed in partial tag + + ins v27.d[0], v8.d[1] //GHASH final-7 block - 
mid +.inst 0xce01752c //eor3 v12.16b, v9.16b, v1.16b, v29.16b //AES final-6 block - result + + pmull2 v17.1q, v8.2d, v25.2d //GHASH final-7 block - high + + eor v27.8b, v27.8b, v8.8b //GHASH final-7 block - mid + movi v16.8b, #0 //suppress further partial tag feed in + + pmull v19.1q, v8.1d, v25.1d //GHASH final-7 block - low + pmull v18.1q, v27.1d, v18.1d //GHASH final-7 block - mid +.L256_dec_blocks_more_than_6: //blocks left > 6 + + rev64 v8.16b, v9.16b //GHASH final-6 block + + eor v8.16b, v8.16b, v16.16b //feed in partial tag + ldr q9, [x0], #16 //AES final-5 block - load ciphertext + movi v16.8b, #0 //suppress further partial tag feed in + + ins v27.d[0], v8.d[1] //GHASH final-6 block - mid + st1 { v12.16b}, [x2], #16 //AES final-6 block - store result + pmull2 v28.1q, v8.2d, v23.2d //GHASH final-6 block - high + + pmull v26.1q, v8.1d, v23.1d //GHASH final-6 block - low + +.inst 0xce02752c //eor3 v12.16b, v9.16b, v2.16b, v29.16b //AES final-5 block - result + eor v19.16b, v19.16b, v26.16b //GHASH final-6 block - low + eor v27.8b, v27.8b, v8.8b //GHASH final-6 block - mid + + pmull v27.1q, v27.1d, v24.1d //GHASH final-6 block - mid + + eor v18.16b, v18.16b, v27.16b //GHASH final-6 block - mid + eor v17.16b, v17.16b, v28.16b //GHASH final-6 block - high +.L256_dec_blocks_more_than_5: //blocks left > 5 + + rev64 v8.16b, v9.16b //GHASH final-5 block + + eor v8.16b, v8.16b, v16.16b //feed in partial tag + + pmull2 v28.1q, v8.2d, v22.2d //GHASH final-5 block - high + ins v27.d[0], v8.d[1] //GHASH final-5 block - mid + + ldr q9, [x0], #16 //AES final-4 block - load ciphertext + + eor v27.8b, v27.8b, v8.8b //GHASH final-5 block - mid + st1 { v12.16b}, [x2], #16 //AES final-5 block - store result + + pmull v26.1q, v8.1d, v22.1d //GHASH final-5 block - low + ins v27.d[1], v27.d[0] //GHASH final-5 block - mid + + pmull2 v27.1q, v27.2d, v21.2d //GHASH final-5 block - mid + + eor v17.16b, v17.16b, v28.16b //GHASH final-5 block - high +.inst 0xce03752c //eor3 v12.16b, v9.16b, v3.16b, v29.16b //AES final-4 block - result + eor v19.16b, v19.16b, v26.16b //GHASH final-5 block - low + + eor v18.16b, v18.16b, v27.16b //GHASH final-5 block - mid + movi v16.8b, #0 //suppress further partial tag feed in +.L256_dec_blocks_more_than_4: //blocks left > 4 + + rev64 v8.16b, v9.16b //GHASH final-4 block + + eor v8.16b, v8.16b, v16.16b //feed in partial tag + + ins v27.d[0], v8.d[1] //GHASH final-4 block - mid + ldr q9, [x0], #16 //AES final-3 block - load ciphertext + + movi v16.8b, #0 //suppress further partial tag feed in + + pmull v26.1q, v8.1d, v20.1d //GHASH final-4 block - low + pmull2 v28.1q, v8.2d, v20.2d //GHASH final-4 block - high + + eor v27.8b, v27.8b, v8.8b //GHASH final-4 block - mid + + eor v17.16b, v17.16b, v28.16b //GHASH final-4 block - high + + pmull v27.1q, v27.1d, v21.1d //GHASH final-4 block - mid + + eor v19.16b, v19.16b, v26.16b //GHASH final-4 block - low + st1 { v12.16b}, [x2], #16 //AES final-4 block - store result + + eor v18.16b, v18.16b, v27.16b //GHASH final-4 block - mid +.inst 0xce04752c //eor3 v12.16b, v9.16b, v4.16b, v29.16b //AES final-3 block - result +.L256_dec_blocks_more_than_3: //blocks left > 3 + + ldr q25, [x3, #112] //load h4l | h4h + ext v25.16b, v25.16b, v25.16b, #8 + rev64 v8.16b, v9.16b //GHASH final-3 block + + eor v8.16b, v8.16b, v16.16b //feed in partial tag + ldr q9, [x0], #16 //AES final-2 block - load ciphertext + ldr q24, [x3, #96] //load h4k | h3k + + ins v27.d[0], v8.d[1] //GHASH final-3 block - mid + st1 { v12.16b}, [x2], #16 //AES final-3 block - store 
result + +.inst 0xce05752c //eor3 v12.16b, v9.16b, v5.16b, v29.16b //AES final-2 block - result + + eor v27.8b, v27.8b, v8.8b //GHASH final-3 block - mid + + ins v27.d[1], v27.d[0] //GHASH final-3 block - mid + pmull v26.1q, v8.1d, v25.1d //GHASH final-3 block - low + pmull2 v28.1q, v8.2d, v25.2d //GHASH final-3 block - high + + movi v16.8b, #0 //suppress further partial tag feed in + pmull2 v27.1q, v27.2d, v24.2d //GHASH final-3 block - mid + eor v19.16b, v19.16b, v26.16b //GHASH final-3 block - low + + eor v17.16b, v17.16b, v28.16b //GHASH final-3 block - high + + eor v18.16b, v18.16b, v27.16b //GHASH final-3 block - mid +.L256_dec_blocks_more_than_2: //blocks left > 2 + + rev64 v8.16b, v9.16b //GHASH final-2 block + + ldr q23, [x3, #80] //load h3l | h3h + ext v23.16b, v23.16b, v23.16b, #8 + ldr q9, [x0], #16 //AES final-1 block - load ciphertext + + eor v8.16b, v8.16b, v16.16b //feed in partial tag + + ins v27.d[0], v8.d[1] //GHASH final-2 block - mid + + pmull v26.1q, v8.1d, v23.1d //GHASH final-2 block - low + st1 { v12.16b}, [x2], #16 //AES final-2 block - store result +.inst 0xce06752c //eor3 v12.16b, v9.16b, v6.16b, v29.16b //AES final-1 block - result + + eor v27.8b, v27.8b, v8.8b //GHASH final-2 block - mid + eor v19.16b, v19.16b, v26.16b //GHASH final-2 block - low + movi v16.8b, #0 //suppress further partial tag feed in + + pmull v27.1q, v27.1d, v24.1d //GHASH final-2 block - mid + pmull2 v28.1q, v8.2d, v23.2d //GHASH final-2 block - high + + eor v18.16b, v18.16b, v27.16b //GHASH final-2 block - mid + eor v17.16b, v17.16b, v28.16b //GHASH final-2 block - high +.L256_dec_blocks_more_than_1: //blocks left > 1 + + rev64 v8.16b, v9.16b //GHASH final-1 block + + eor v8.16b, v8.16b, v16.16b //feed in partial tag + + ins v27.d[0], v8.d[1] //GHASH final-1 block - mid + ldr q22, [x3, #64] //load h2l | h2h + ext v22.16b, v22.16b, v22.16b, #8 + + eor v27.8b, v27.8b, v8.8b //GHASH final-1 block - mid + ldr q9, [x0], #16 //AES final block - load ciphertext + st1 { v12.16b}, [x2], #16 //AES final-1 block - store result + + ldr q21, [x3, #48] //load h2k | h1k + pmull v26.1q, v8.1d, v22.1d //GHASH final-1 block - low + + ins v27.d[1], v27.d[0] //GHASH final-1 block - mid + + eor v19.16b, v19.16b, v26.16b //GHASH final-1 block - low + +.inst 0xce07752c //eor3 v12.16b, v9.16b, v7.16b, v29.16b //AES final block - result + pmull2 v28.1q, v8.2d, v22.2d //GHASH final-1 block - high + + pmull2 v27.1q, v27.2d, v21.2d //GHASH final-1 block - mid + + movi v16.8b, #0 //suppress further partial tag feed in + eor v17.16b, v17.16b, v28.16b //GHASH final-1 block - high + + eor v18.16b, v18.16b, v27.16b //GHASH final-1 block - mid +.L256_dec_blocks_less_than_1: //blocks left <= 1 + + ld1 { v26.16b}, [x2] //load existing bytes where the possibly partial last block is to be stored + mvn x6, xzr //temp0_x = 0xffffffffffffffff + and x1, x1, #127 //bit_length %= 128 + + sub x1, x1, #128 //bit_length -= 128 + rev32 v30.16b, v30.16b + str q30, [x16] //store the updated counter + + neg x1, x1 //bit_length = 128 - #bits in input (in range [1,128]) + + and x1, x1, #127 //bit_length %= 128 + + lsr x6, x6, x1 //temp0_x is mask for top 64b of last block + cmp x1, #64 + mvn x7, xzr //temp1_x = 0xffffffffffffffff + + csel x14, x6, xzr, lt + csel x13, x7, x6, lt + + mov v0.d[0], x13 //ctr0b is mask for last block + mov v0.d[1], x14 + + and v9.16b, v9.16b, v0.16b //possibly partial last block has zeroes in highest bits + ldr q20, [x3, #32] //load h1l | h1h + ext v20.16b, v20.16b, v20.16b, #8 + bif v12.16b, v26.16b, v0.16b 
//insert existing bytes in top end of result before storing + + rev64 v8.16b, v9.16b //GHASH final block + + eor v8.16b, v8.16b, v16.16b //feed in partial tag + + ins v16.d[0], v8.d[1] //GHASH final block - mid + pmull2 v28.1q, v8.2d, v20.2d //GHASH final block - high + + eor v16.8b, v16.8b, v8.8b //GHASH final block - mid + + pmull v26.1q, v8.1d, v20.1d //GHASH final block - low + eor v17.16b, v17.16b, v28.16b //GHASH final block - high + + pmull v16.1q, v16.1d, v21.1d //GHASH final block - mid + + eor v18.16b, v18.16b, v16.16b //GHASH final block - mid + ldr d16, [x10] //MODULO - load modulo constant + eor v19.16b, v19.16b, v26.16b //GHASH final block - low + + pmull v21.1q, v17.1d, v16.1d //MODULO - top 64b align with mid + eor v14.16b, v17.16b, v19.16b //MODULO - karatsuba tidy up + + ext v17.16b, v17.16b, v17.16b, #8 //MODULO - other top alignment + st1 { v12.16b}, [x2] //store all 16B + + eor v18.16b, v18.16b, v14.16b //MODULO - karatsuba tidy up + + eor v21.16b, v17.16b, v21.16b //MODULO - fold into mid + eor v18.16b, v18.16b, v21.16b //MODULO - fold into mid + + pmull v17.1q, v18.1d, v16.1d //MODULO - mid 64b align with low + + ext v18.16b, v18.16b, v18.16b, #8 //MODULO - other mid alignment + eor v19.16b, v19.16b, v17.16b //MODULO - fold into low + + eor v19.16b, v19.16b, v18.16b //MODULO - fold into low + ext v19.16b, v19.16b, v19.16b, #8 + rev64 v19.16b, v19.16b + st1 { v19.16b }, [x3] + mov x0, x9 + + ldp d10, d11, [sp, #16] + ldp d12, d13, [sp, #32] + ldp d14, d15, [sp, #48] + ldp d8, d9, [sp], #80 + ret + +.L256_dec_ret: + mov w0, #0x0 + ret +.size unroll8_eor3_aes_gcm_dec_256_kernel,.-unroll8_eor3_aes_gcm_dec_256_kernel +.byte 65,69,83,32,71,67,77,32,109,111,100,117,108,101,32,102,111,114,32,65,82,77,118,56,44,32,83,80,68,88,32,66,83,68,45,51,45,67,108,97,117,115,101,32,98,121,32,60,120,105,97,111,107,97,110,103,46,113,105,97,110,64,97,114,109,46,99,111,109,62,0 +.align 2 +.align 2 +#endif diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/AARCH64-GCC/crypto/modes/aes-gcm-armv8_64.S b/CryptoPkg/Library/OpensslLib/OpensslGen/AARCH64-GCC/crypto/modes/aes-gcm-armv8_64.S index 6b0bda3ad8d5..75e29532bc13 100644 --- a/CryptoPkg/Library/OpensslLib/OpensslGen/AARCH64-GCC/crypto/modes/aes-gcm-armv8_64.S +++ b/CryptoPkg/Library/OpensslLib/OpensslGen/AARCH64-GCC/crypto/modes/aes-gcm-armv8_64.S @@ -7,6 +7,7 @@ .type aes_gcm_enc_128_kernel,%function .align 4 aes_gcm_enc_128_kernel: + AARCH64_VALID_CALL_TARGET cbz x1, .L128_enc_ret stp x19, x20, [sp, #-112]! mov x16, x4 @@ -989,6 +990,7 @@ aes_gcm_enc_128_kernel: .type aes_gcm_dec_128_kernel,%function .align 4 aes_gcm_dec_128_kernel: + AARCH64_VALID_CALL_TARGET cbz x1, .L128_dec_ret stp x19, x20, [sp, #-112]! mov x16, x4 @@ -1981,6 +1983,7 @@ aes_gcm_dec_128_kernel: .type aes_gcm_enc_192_kernel,%function .align 4 aes_gcm_enc_192_kernel: + AARCH64_VALID_CALL_TARGET cbz x1, .L192_enc_ret stp x19, x20, [sp, #-112]! mov x16, x4 @@ -3038,6 +3041,7 @@ aes_gcm_enc_192_kernel: .type aes_gcm_dec_192_kernel,%function .align 4 aes_gcm_dec_192_kernel: + AARCH64_VALID_CALL_TARGET cbz x1, .L192_dec_ret stp x19, x20, [sp, #-112]! mov x16, x4 @@ -4105,6 +4109,7 @@ aes_gcm_dec_192_kernel: .type aes_gcm_enc_256_kernel,%function .align 4 aes_gcm_enc_256_kernel: + AARCH64_VALID_CALL_TARGET cbz x1, .L256_enc_ret stp x19, x20, [sp, #-112]! mov x16, x4 @@ -5229,6 +5234,7 @@ aes_gcm_enc_256_kernel: .type aes_gcm_dec_256_kernel,%function .align 4 aes_gcm_dec_256_kernel: + AARCH64_VALID_CALL_TARGET cbz x1, .L256_dec_ret stp x19, x20, [sp, #-112]! 
mov x16, x4 diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/AARCH64-GCC/crypto/modes/ghashv8-armx.S b/CryptoPkg/Library/OpensslLib/OpensslGen/AARCH64-GCC/crypto/modes/ghashv8-armx.S index cf2eadb62953..955379036110 100644 --- a/CryptoPkg/Library/OpensslLib/OpensslGen/AARCH64-GCC/crypto/modes/ghashv8-armx.S +++ b/CryptoPkg/Library/OpensslLib/OpensslGen/AARCH64-GCC/crypto/modes/ghashv8-armx.S @@ -7,6 +7,7 @@ .type gcm_init_v8,%function .align 4 gcm_init_v8: + AARCH64_VALID_CALL_TARGET ld1 {v17.2d},[x1] //load input H movi v19.16b,#0xe1 shl v19.2d,v19.2d,#57 //0xc2.0 @@ -82,21 +83,110 @@ gcm_init_v8: pmull v5.1q,v5.1d,v19.1d eor v18.16b,v18.16b,v2.16b eor v4.16b,v4.16b,v7.16b - eor v20.16b, v0.16b,v18.16b //H^3 - eor v22.16b,v5.16b,v4.16b //H^4 + eor v23.16b, v0.16b,v18.16b //H^3 + eor v25.16b,v5.16b,v4.16b //H^4 + + ext v16.16b,v23.16b, v23.16b,#8 //Karatsuba pre-processing + ext v17.16b,v25.16b,v25.16b,#8 + ext v18.16b,v22.16b,v22.16b,#8 + eor v16.16b,v16.16b,v23.16b + eor v17.16b,v17.16b,v25.16b + eor v18.16b,v18.16b,v22.16b + ext v24.16b,v16.16b,v17.16b,#8 //pack Karatsuba pre-processed + st1 {v23.2d,v24.2d,v25.2d},[x0],#48 //store Htable[3..5] + + //calculate H^5 and H^6 + pmull v0.1q,v22.1d, v23.1d + pmull v5.1q,v23.1d,v23.1d + pmull2 v2.1q,v22.2d, v23.2d + pmull2 v7.1q,v23.2d,v23.2d + pmull v1.1q,v16.1d,v18.1d + pmull v6.1q,v16.1d,v16.1d - ext v16.16b,v20.16b, v20.16b,#8 //Karatsuba pre-processing - ext v17.16b,v22.16b,v22.16b,#8 - eor v16.16b,v16.16b,v20.16b - eor v17.16b,v17.16b,v22.16b - ext v21.16b,v16.16b,v17.16b,#8 //pack Karatsuba pre-processed - st1 {v20.2d,v21.2d,v22.2d},[x0] //store Htable[3..5] + ext v16.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing + ext v17.16b,v5.16b,v7.16b,#8 + eor v18.16b,v0.16b,v2.16b + eor v1.16b,v1.16b,v16.16b + eor v4.16b,v5.16b,v7.16b + eor v6.16b,v6.16b,v17.16b + eor v1.16b,v1.16b,v18.16b + pmull v18.1q,v0.1d,v19.1d //1st phase + eor v6.16b,v6.16b,v4.16b + pmull v4.1q,v5.1d,v19.1d + + ins v2.d[0],v1.d[1] + ins v7.d[0],v6.d[1] + ins v1.d[1],v0.d[0] + ins v6.d[1],v5.d[0] + eor v0.16b,v1.16b,v18.16b + eor v5.16b,v6.16b,v4.16b + + ext v18.16b,v0.16b,v0.16b,#8 //2nd phase + ext v4.16b,v5.16b,v5.16b,#8 + pmull v0.1q,v0.1d,v19.1d + pmull v5.1q,v5.1d,v19.1d + eor v18.16b,v18.16b,v2.16b + eor v4.16b,v4.16b,v7.16b + eor v26.16b,v0.16b,v18.16b //H^5 + eor v28.16b,v5.16b,v4.16b //H^6 + + ext v16.16b,v26.16b, v26.16b,#8 //Karatsuba pre-processing + ext v17.16b,v28.16b,v28.16b,#8 + ext v18.16b,v22.16b,v22.16b,#8 + eor v16.16b,v16.16b,v26.16b + eor v17.16b,v17.16b,v28.16b + eor v18.16b,v18.16b,v22.16b + ext v27.16b,v16.16b,v17.16b,#8 //pack Karatsuba pre-processed + st1 {v26.2d,v27.2d,v28.2d},[x0],#48 //store Htable[6..8] + + //calculate H^7 and H^8 + pmull v0.1q,v22.1d,v26.1d + pmull v5.1q,v22.1d,v28.1d + pmull2 v2.1q,v22.2d,v26.2d + pmull2 v7.1q,v22.2d,v28.2d + pmull v1.1q,v16.1d,v18.1d + pmull v6.1q,v17.1d,v18.1d + + ext v16.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing + ext v17.16b,v5.16b,v7.16b,#8 + eor v18.16b,v0.16b,v2.16b + eor v1.16b,v1.16b,v16.16b + eor v4.16b,v5.16b,v7.16b + eor v6.16b,v6.16b,v17.16b + eor v1.16b,v1.16b,v18.16b + pmull v18.1q,v0.1d,v19.1d //1st phase + eor v6.16b,v6.16b,v4.16b + pmull v4.1q,v5.1d,v19.1d + + ins v2.d[0],v1.d[1] + ins v7.d[0],v6.d[1] + ins v1.d[1],v0.d[0] + ins v6.d[1],v5.d[0] + eor v0.16b,v1.16b,v18.16b + eor v5.16b,v6.16b,v4.16b + + ext v18.16b,v0.16b,v0.16b,#8 //2nd phase + ext v4.16b,v5.16b,v5.16b,#8 + pmull v0.1q,v0.1d,v19.1d + pmull v5.1q,v5.1d,v19.1d + eor v18.16b,v18.16b,v2.16b + eor 
v4.16b,v4.16b,v7.16b + eor v29.16b,v0.16b,v18.16b //H^7 + eor v31.16b,v5.16b,v4.16b //H^8 + + ext v16.16b,v29.16b,v29.16b,#8 //Karatsuba pre-processing + ext v17.16b,v31.16b,v31.16b,#8 + eor v16.16b,v16.16b,v29.16b + eor v17.16b,v17.16b,v31.16b + ext v30.16b,v16.16b,v17.16b,#8 //pack Karatsuba pre-processed + st1 {v29.2d,v30.2d,v31.2d},[x0] //store Htable[9..11] ret .size gcm_init_v8,.-gcm_init_v8 .globl gcm_gmult_v8 .type gcm_gmult_v8,%function .align 4 gcm_gmult_v8: + AARCH64_VALID_CALL_TARGET ld1 {v17.2d},[x0] //load Xi movi v19.16b,#0xe1 ld1 {v20.2d,v21.2d},[x1] //load twisted H, ... @@ -138,6 +228,7 @@ gcm_gmult_v8: .type gcm_ghash_v8,%function .align 4 gcm_ghash_v8: + AARCH64_VALID_CALL_TARGET cmp x3,#64 b.hs .Lgcm_ghash_v8_4x ld1 {v0.2d},[x0] //load [rotated] Xi diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/AARCH64-GCC/crypto/sha/keccak1600-armv8.S b/CryptoPkg/Library/OpensslLib/OpensslGen/AARCH64-GCC/crypto/sha/keccak1600-armv8.S index 67e553f97634..a6f0f603a092 100644 --- a/CryptoPkg/Library/OpensslLib/OpensslGen/AARCH64-GCC/crypto/sha/keccak1600-armv8.S +++ b/CryptoPkg/Library/OpensslLib/OpensslGen/AARCH64-GCC/crypto/sha/keccak1600-armv8.S @@ -1,3 +1,5 @@ +#include "arm_arch.h" + .text .align 8 // strategic alignment and padding that allows to use @@ -33,8 +35,8 @@ iotas: .type KeccakF1600_int,%function .align 5 KeccakF1600_int: + AARCH64_SIGN_LINK_REGISTER adr x28,iotas -.inst 0xd503233f // paciasp stp x28,x30,[sp,#16] // 32 bytes on top are mine b .Loop .align 4 @@ -198,14 +200,14 @@ KeccakF1600_int: bne .Loop ldr x30,[sp,#24] -.inst 0xd50323bf // autiasp + AARCH64_VALIDATE_LINK_REGISTER ret .size KeccakF1600_int,.-KeccakF1600_int .type KeccakF1600,%function .align 5 KeccakF1600: -.inst 0xd503233f // paciasp + AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-128]! add x29,sp,#0 stp x19,x20,[sp,#16] @@ -255,7 +257,7 @@ KeccakF1600: ldp x25,x26,[x29,#64] ldp x27,x28,[x29,#80] ldp x29,x30,[sp],#128 -.inst 0xd50323bf // autiasp + AARCH64_VALIDATE_LINK_REGISTER ret .size KeccakF1600,.-KeccakF1600 @@ -263,7 +265,7 @@ KeccakF1600: .type SHA3_absorb,%function .align 5 SHA3_absorb: -.inst 0xd503233f // paciasp + AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-128]! add x29,sp,#0 stp x19,x20,[sp,#16] @@ -497,14 +499,14 @@ SHA3_absorb: ldp x25,x26,[x29,#64] ldp x27,x28,[x29,#80] ldp x29,x30,[sp],#128 -.inst 0xd50323bf // autiasp + AARCH64_VALIDATE_LINK_REGISTER ret .size SHA3_absorb,.-SHA3_absorb .globl SHA3_squeeze .type SHA3_squeeze,%function .align 5 SHA3_squeeze: -.inst 0xd503233f // paciasp + AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-48]! add x29,sp,#0 stp x19,x20,[sp,#16] @@ -514,6 +516,8 @@ SHA3_squeeze: mov x20,x1 mov x21,x2 mov x22,x3 + cmp w4, #0 // w4 = 'next' argument + bne .Lnext_block .Loop_squeeze: ldr x4,[x0],#8 @@ -528,7 +532,7 @@ SHA3_squeeze: subs x3,x3,#8 bhi .Loop_squeeze - +.Lnext_block: mov x0,x19 bl KeccakF1600 mov x0,x19 @@ -567,7 +571,7 @@ SHA3_squeeze: ldp x19,x20,[sp,#16] ldp x21,x22,[sp,#32] ldp x29,x30,[sp],#48 -.inst 0xd50323bf // autiasp + AARCH64_VALIDATE_LINK_REGISTER ret .size SHA3_squeeze,.-SHA3_squeeze .type KeccakF1600_ce,%function @@ -675,7 +679,7 @@ KeccakF1600_ce: .type KeccakF1600_cext,%function .align 5 KeccakF1600_cext: -.inst 0xd503233f // paciasp + AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-80]! 
add x29,sp,#0 stp d8,d9,[sp,#16] // per ABI requirement @@ -716,14 +720,14 @@ KeccakF1600_cext: ldp d12,d13,[sp,#48] ldp d14,d15,[sp,#64] ldr x29,[sp],#80 -.inst 0xd50323bf // autiasp + AARCH64_VALIDATE_LINK_REGISTER ret .size KeccakF1600_cext,.-KeccakF1600_cext .globl SHA3_absorb_cext .type SHA3_absorb_cext,%function .align 5 SHA3_absorb_cext: -.inst 0xd503233f // paciasp + AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-80]! add x29,sp,#0 stp d8,d9,[sp,#16] // per ABI requirement @@ -939,14 +943,14 @@ SHA3_absorb_cext: ldp d12,d13,[sp,#48] ldp d14,d15,[sp,#64] ldp x29,x30,[sp],#80 -.inst 0xd50323bf // autiasp + AARCH64_VALIDATE_LINK_REGISTER ret .size SHA3_absorb_cext,.-SHA3_absorb_cext .globl SHA3_squeeze_cext .type SHA3_squeeze_cext,%function .align 5 SHA3_squeeze_cext: -.inst 0xd503233f // paciasp + AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-16]! add x29,sp,#0 mov x9,x0 @@ -1002,7 +1006,7 @@ SHA3_squeeze_cext: .Lsqueeze_done_ce: ldr x29,[sp],#16 -.inst 0xd50323bf // autiasp + AARCH64_VALIDATE_LINK_REGISTER ret .size SHA3_squeeze_cext,.-SHA3_squeeze_cext .byte 75,101,99,99,97,107,45,49,54,48,48,32,97,98,115,111,114,98,32,97,110,100,32,115,113,117,101,101,122,101,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/AARCH64-GCC/crypto/sha/sha1-armv8.S b/CryptoPkg/Library/OpensslLib/OpensslGen/AARCH64-GCC/crypto/sha/sha1-armv8.S index 7f6d5be9537f..42fc0a74c1c4 100644 --- a/CryptoPkg/Library/OpensslLib/OpensslGen/AARCH64-GCC/crypto/sha/sha1-armv8.S +++ b/CryptoPkg/Library/OpensslLib/OpensslGen/AARCH64-GCC/crypto/sha/sha1-armv8.S @@ -1,5 +1,5 @@ +#include "arm_arch.h" #ifndef __KERNEL__ -# include "arm_arch.h" .hidden OPENSSL_armcap_P #endif @@ -10,11 +10,13 @@ .type sha1_block_data_order,%function .align 6 sha1_block_data_order: + AARCH64_VALID_CALL_TARGET adrp x16,OPENSSL_armcap_P ldr w16,[x16,#:lo12:OPENSSL_armcap_P] tst w16,#ARMV8_SHA1 b.ne .Lv8_entry + // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. stp x29,x30,[sp,#-96]! add x29,sp,#0 stp x19,x20,[sp,#16] @@ -1074,6 +1076,7 @@ sha1_block_data_order: .align 6 sha1_block_armv8: .Lv8_entry: + // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. stp x29,x30,[sp,#-16]! add x29,sp,#0 diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/AARCH64-GCC/crypto/sha/sha256-armv8.S b/CryptoPkg/Library/OpensslLib/OpensslGen/AARCH64-GCC/crypto/sha/sha256-armv8.S index 92a596403a1b..9d397fae8fb8 100644 --- a/CryptoPkg/Library/OpensslLib/OpensslGen/AARCH64-GCC/crypto/sha/sha256-armv8.S +++ b/CryptoPkg/Library/OpensslLib/OpensslGen/AARCH64-GCC/crypto/sha/sha256-armv8.S @@ -4,8 +4,59 @@ // this file except in compliance with the License. You can obtain a copy // in the file LICENSE in the source distribution or at // https://www.openssl.org/source/license.html + +// ==================================================================== +// Written by Andy Polyakov for the OpenSSL +// project. The module is, however, dual licensed under OpenSSL and +// CRYPTOGAMS licenses depending on where you obtain it. For further +// details see http://www.openssl.org/~appro/cryptogams/. +// +// Permission to use under GPLv2 terms is granted. +// ==================================================================== +// +// SHA256/512 for ARMv8. 
+// +// Performance in cycles per processed byte and improvement coefficient +// over code generated with "default" compiler: +// +// SHA256-hw SHA256(*) SHA512 +// Apple A7 1.97 10.5 (+33%) 6.73 (-1%(**)) +// Cortex-A53 2.38 15.5 (+115%) 10.0 (+150%(***)) +// Cortex-A57 2.31 11.6 (+86%) 7.51 (+260%(***)) +// Denver 2.01 10.5 (+26%) 6.70 (+8%) +// X-Gene 20.0 (+100%) 12.8 (+300%(***)) +// Mongoose 2.36 13.0 (+50%) 8.36 (+33%) +// Kryo 1.92 17.4 (+30%) 11.2 (+8%) +// ThunderX2 2.54 13.2 (+40%) 8.40 (+18%) +// +// (*) Software SHA256 results are of lesser relevance, presented +// mostly for informational purposes. +// (**) The result is a trade-off: it's possible to improve it by +// 10% (or by 1 cycle per round), but at the cost of 20% loss +// on Cortex-A53 (or by 4 cycles per round). +// (***) Super-impressive coefficients over gcc-generated code are +// indication of some compiler "pathology", most notably code +// generated with -mgeneral-regs-only is significantly faster +// and the gap is only 40-90%. +// +// October 2016. +// +// Originally it was reckoned that it makes no sense to implement NEON +// version of SHA256 for 64-bit processors. This is because performance +// improvement on most wide-spread Cortex-A5x processors was observed +// to be marginal, same on Cortex-A53 and ~10% on A57. But then it was +// observed that 32-bit NEON SHA256 performs significantly better than +// 64-bit scalar version on *some* of the more recent processors. As +// result 64-bit NEON version of SHA256 was added to provide best +// all-round performance. For example it executes ~30% faster on X-Gene +// and Mongoose. [For reference, NEON version of SHA512 is bound to +// deliver much less improvement, likely *negative* on Cortex-A5x. +// Which is why NEON support is limited to SHA256.] + +// $output is the last argument if it looks like a file (it has an extension) +// $flavour is the first argument if it doesn't look like a file +#include "arm_arch.h" #ifndef __KERNEL__ -# include "arm_arch.h" .hidden OPENSSL_armcap_P #endif @@ -16,6 +67,7 @@ .type sha256_block_data_order,%function .align 6 sha256_block_data_order: + AARCH64_VALID_CALL_TARGET #ifndef __KERNEL__ adrp x16,OPENSSL_armcap_P ldr w16,[x16,#:lo12:OPENSSL_armcap_P] @@ -24,7 +76,7 @@ sha256_block_data_order: tst w16,#ARMV7_NEON b.ne .Lneon_entry #endif -.inst 0xd503233f // paciasp + AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-128]! add x29,sp,#0 @@ -984,7 +1036,7 @@ sha256_block_data_order: ldp x25,x26,[x29,#64] ldp x27,x28,[x29,#80] ldp x29,x30,[sp],#128 -.inst 0xd50323bf // autiasp + AARCH64_VALIDATE_LINK_REGISTER ret .size sha256_block_data_order,.-sha256_block_data_order @@ -1017,6 +1069,7 @@ sha256_block_data_order: .align 6 sha256_block_armv8: .Lv8_entry: + // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. stp x29,x30,[sp,#-16]! add x29,sp,#0 @@ -1158,7 +1211,9 @@ sha256_block_armv8: .type sha256_block_neon,%function .align 4 sha256_block_neon: + AARCH64_VALID_CALL_TARGET .Lneon_entry: + // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later stp x29, x30, [sp, #-16]! 
mov x29, sp sub sp,sp,#16*4 diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/AARCH64-GCC/crypto/sha/sha512-armv8.S b/CryptoPkg/Library/OpensslLib/OpensslGen/AARCH64-GCC/crypto/sha/sha512-armv8.S index 9c2cdfe79ef0..dbc688df71cb 100644 --- a/CryptoPkg/Library/OpensslLib/OpensslGen/AARCH64-GCC/crypto/sha/sha512-armv8.S +++ b/CryptoPkg/Library/OpensslLib/OpensslGen/AARCH64-GCC/crypto/sha/sha512-armv8.S @@ -4,8 +4,59 @@ // this file except in compliance with the License. You can obtain a copy // in the file LICENSE in the source distribution or at // https://www.openssl.org/source/license.html + +// ==================================================================== +// Written by Andy Polyakov for the OpenSSL +// project. The module is, however, dual licensed under OpenSSL and +// CRYPTOGAMS licenses depending on where you obtain it. For further +// details see http://www.openssl.org/~appro/cryptogams/. +// +// Permission to use under GPLv2 terms is granted. +// ==================================================================== +// +// SHA256/512 for ARMv8. +// +// Performance in cycles per processed byte and improvement coefficient +// over code generated with "default" compiler: +// +// SHA256-hw SHA256(*) SHA512 +// Apple A7 1.97 10.5 (+33%) 6.73 (-1%(**)) +// Cortex-A53 2.38 15.5 (+115%) 10.0 (+150%(***)) +// Cortex-A57 2.31 11.6 (+86%) 7.51 (+260%(***)) +// Denver 2.01 10.5 (+26%) 6.70 (+8%) +// X-Gene 20.0 (+100%) 12.8 (+300%(***)) +// Mongoose 2.36 13.0 (+50%) 8.36 (+33%) +// Kryo 1.92 17.4 (+30%) 11.2 (+8%) +// ThunderX2 2.54 13.2 (+40%) 8.40 (+18%) +// +// (*) Software SHA256 results are of lesser relevance, presented +// mostly for informational purposes. +// (**) The result is a trade-off: it's possible to improve it by +// 10% (or by 1 cycle per round), but at the cost of 20% loss +// on Cortex-A53 (or by 4 cycles per round). +// (***) Super-impressive coefficients over gcc-generated code are +// indication of some compiler "pathology", most notably code +// generated with -mgeneral-regs-only is significantly faster +// and the gap is only 40-90%. +// +// October 2016. +// +// Originally it was reckoned that it makes no sense to implement NEON +// version of SHA256 for 64-bit processors. This is because performance +// improvement on most wide-spread Cortex-A5x processors was observed +// to be marginal, same on Cortex-A53 and ~10% on A57. But then it was +// observed that 32-bit NEON SHA256 performs significantly better than +// 64-bit scalar version on *some* of the more recent processors. As +// result 64-bit NEON version of SHA256 was added to provide best +// all-round performance. For example it executes ~30% faster on X-Gene +// and Mongoose. [For reference, NEON version of SHA512 is bound to +// deliver much less improvement, likely *negative* on Cortex-A5x. +// Which is why NEON support is limited to SHA256.] + +// $output is the last argument if it looks like a file (it has an extension) +// $flavour is the first argument if it doesn't look like a file +#include "arm_arch.h" #ifndef __KERNEL__ -# include "arm_arch.h" .hidden OPENSSL_armcap_P #endif @@ -16,13 +67,14 @@ .type sha512_block_data_order,%function .align 6 sha512_block_data_order: + AARCH64_VALID_CALL_TARGET #ifndef __KERNEL__ adrp x16,OPENSSL_armcap_P ldr w16,[x16,#:lo12:OPENSSL_armcap_P] tst w16,#ARMV8_SHA512 b.ne .Lv8_entry #endif -.inst 0xd503233f // paciasp + AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-128]! 
add x29,sp,#0 @@ -982,7 +1034,7 @@ sha512_block_data_order: ldp x25,x26,[x29,#64] ldp x27,x28,[x29,#80] ldp x29,x30,[sp],#128 -.inst 0xd50323bf // autiasp + AARCH64_VALIDATE_LINK_REGISTER ret .size sha512_block_data_order,.-sha512_block_data_order @@ -1039,6 +1091,7 @@ sha512_block_data_order: .align 6 sha512_block_armv8: .Lv8_entry: + // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later stp x29,x30,[sp,#-16]! add x29,sp,#0 diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/AARCH64-GCC/crypto/sm3/sm3-armv8.S b/CryptoPkg/Library/OpensslLib/OpensslGen/AARCH64-GCC/crypto/sm3/sm3-armv8.S new file mode 100644 index 000000000000..f1678ff1439b --- /dev/null +++ b/CryptoPkg/Library/OpensslLib/OpensslGen/AARCH64-GCC/crypto/sm3/sm3-armv8.S @@ -0,0 +1,503 @@ +// Copyright 2021-2023 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License 2.0 (the "License"). You may not use +// this file except in compliance with the License. You can obtain a copy +// in the file LICENSE in the source distribution or at +// https://www.openssl.org/source/license.html +// +// This module implements support for Armv8 SM3 instructions + +// $output is the last argument if it looks like a file (it has an extension) +// $flavour is the first argument if it doesn't look like a file +#include "arm_arch.h" +.text +.globl ossl_hwsm3_block_data_order +.type ossl_hwsm3_block_data_order,%function +.align 5 +ossl_hwsm3_block_data_order: + AARCH64_VALID_CALL_TARGET + // load state + ld1 {v5.4s,v6.4s}, [x0] + rev64 v5.4s, v5.4s + rev64 v6.4s, v6.4s + ext v5.16b, v5.16b, v5.16b, #8 + ext v6.16b, v6.16b, v6.16b, #8 + + adr x8, .Tj + ldp s16, s17, [x8] + +.Loop: + // load input + ld1 {v0.16b,v1.16b,v2.16b,v3.16b}, [x1], #64 + sub w2, w2, #1 + + mov v18.16b, v5.16b + mov v19.16b, v6.16b + +#ifndef __ARMEB__ + rev32 v0.16b, v0.16b + rev32 v1.16b, v1.16b + rev32 v2.16b, v2.16b + rev32 v3.16b, v3.16b +#endif + + ext v20.16b, v16.16b, v16.16b, #4 + // s4 = w7 | w8 | w9 | w10 + ext v4.16b, v1.16b, v2.16b, #12 + // vtmp1 = w3 | w4 | w5 | w6 + ext v22.16b, v0.16b, v1.16b, #12 + // vtmp2 = w10 | w11 | w12 | w13 + ext v23.16b, v2.16b, v3.16b, #8 +.inst 0xce63c004 //sm3partw1 v4.4s, v0.4s, v3.4s +.inst 0xce76c6e4 //sm3partw2 v4.4s, v23.4s, v22.4s + eor v22.16b, v0.16b, v1.16b +.inst 0xce5418b7 //sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s + shl v21.4s, v20.4s, #1 + sri v21.4s, v20.4s, #31 +.inst 0xce5682e5 //sm3tt1a v5.4s, v23.4s, v22.4s[0] +.inst 0xce408ae6 //sm3tt2a v6.4s, v23.4s, v0.4s[0] +.inst 0xce5518b7 //sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s + shl v20.4s, v21.4s, #1 + sri v20.4s, v21.4s, #31 +.inst 0xce5692e5 //sm3tt1a v5.4s, v23.4s, v22.4s[1] +.inst 0xce409ae6 //sm3tt2a v6.4s, v23.4s, v0.4s[1] +.inst 0xce5418b7 //sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s + shl v21.4s, v20.4s, #1 + sri v21.4s, v20.4s, #31 +.inst 0xce56a2e5 //sm3tt1a v5.4s, v23.4s, v22.4s[2] +.inst 0xce40aae6 //sm3tt2a v6.4s, v23.4s, v0.4s[2] +.inst 0xce5518b7 //sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s + shl v20.4s, v21.4s, #1 + sri v20.4s, v21.4s, #31 +.inst 0xce56b2e5 //sm3tt1a v5.4s, v23.4s, v22.4s[3] +.inst 0xce40bae6 //sm3tt2a v6.4s, v23.4s, v0.4s[3] + // s4 = w7 | w8 | w9 | w10 + ext v0.16b, v2.16b, v3.16b, #12 + // vtmp1 = w3 | w4 | w5 | w6 + ext v22.16b, v1.16b, v2.16b, #12 + // vtmp2 = w10 | w11 | w12 | w13 + ext v23.16b, v3.16b, v4.16b, #8 +.inst 0xce64c020 //sm3partw1 v0.4s, v1.4s, v4.4s +.inst 0xce76c6e0 //sm3partw2 v0.4s, v23.4s, v22.4s + eor v22.16b, v1.16b, v2.16b +.inst 0xce5418b7 //sm3ss1 v23.4s, v5.4s, v20.4s, 
v6.4s + shl v21.4s, v20.4s, #1 + sri v21.4s, v20.4s, #31 +.inst 0xce5682e5 //sm3tt1a v5.4s, v23.4s, v22.4s[0] +.inst 0xce418ae6 //sm3tt2a v6.4s, v23.4s, v1.4s[0] +.inst 0xce5518b7 //sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s + shl v20.4s, v21.4s, #1 + sri v20.4s, v21.4s, #31 +.inst 0xce5692e5 //sm3tt1a v5.4s, v23.4s, v22.4s[1] +.inst 0xce419ae6 //sm3tt2a v6.4s, v23.4s, v1.4s[1] +.inst 0xce5418b7 //sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s + shl v21.4s, v20.4s, #1 + sri v21.4s, v20.4s, #31 +.inst 0xce56a2e5 //sm3tt1a v5.4s, v23.4s, v22.4s[2] +.inst 0xce41aae6 //sm3tt2a v6.4s, v23.4s, v1.4s[2] +.inst 0xce5518b7 //sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s + shl v20.4s, v21.4s, #1 + sri v20.4s, v21.4s, #31 +.inst 0xce56b2e5 //sm3tt1a v5.4s, v23.4s, v22.4s[3] +.inst 0xce41bae6 //sm3tt2a v6.4s, v23.4s, v1.4s[3] + // s4 = w7 | w8 | w9 | w10 + ext v1.16b, v3.16b, v4.16b, #12 + // vtmp1 = w3 | w4 | w5 | w6 + ext v22.16b, v2.16b, v3.16b, #12 + // vtmp2 = w10 | w11 | w12 | w13 + ext v23.16b, v4.16b, v0.16b, #8 +.inst 0xce60c041 //sm3partw1 v1.4s, v2.4s, v0.4s +.inst 0xce76c6e1 //sm3partw2 v1.4s, v23.4s, v22.4s + eor v22.16b, v2.16b, v3.16b +.inst 0xce5418b7 //sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s + shl v21.4s, v20.4s, #1 + sri v21.4s, v20.4s, #31 +.inst 0xce5682e5 //sm3tt1a v5.4s, v23.4s, v22.4s[0] +.inst 0xce428ae6 //sm3tt2a v6.4s, v23.4s, v2.4s[0] +.inst 0xce5518b7 //sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s + shl v20.4s, v21.4s, #1 + sri v20.4s, v21.4s, #31 +.inst 0xce5692e5 //sm3tt1a v5.4s, v23.4s, v22.4s[1] +.inst 0xce429ae6 //sm3tt2a v6.4s, v23.4s, v2.4s[1] +.inst 0xce5418b7 //sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s + shl v21.4s, v20.4s, #1 + sri v21.4s, v20.4s, #31 +.inst 0xce56a2e5 //sm3tt1a v5.4s, v23.4s, v22.4s[2] +.inst 0xce42aae6 //sm3tt2a v6.4s, v23.4s, v2.4s[2] +.inst 0xce5518b7 //sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s + shl v20.4s, v21.4s, #1 + sri v20.4s, v21.4s, #31 +.inst 0xce56b2e5 //sm3tt1a v5.4s, v23.4s, v22.4s[3] +.inst 0xce42bae6 //sm3tt2a v6.4s, v23.4s, v2.4s[3] + // s4 = w7 | w8 | w9 | w10 + ext v2.16b, v4.16b, v0.16b, #12 + // vtmp1 = w3 | w4 | w5 | w6 + ext v22.16b, v3.16b, v4.16b, #12 + // vtmp2 = w10 | w11 | w12 | w13 + ext v23.16b, v0.16b, v1.16b, #8 +.inst 0xce61c062 //sm3partw1 v2.4s, v3.4s, v1.4s +.inst 0xce76c6e2 //sm3partw2 v2.4s, v23.4s, v22.4s + eor v22.16b, v3.16b, v4.16b +.inst 0xce5418b7 //sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s + shl v21.4s, v20.4s, #1 + sri v21.4s, v20.4s, #31 +.inst 0xce5682e5 //sm3tt1a v5.4s, v23.4s, v22.4s[0] +.inst 0xce438ae6 //sm3tt2a v6.4s, v23.4s, v3.4s[0] +.inst 0xce5518b7 //sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s + shl v20.4s, v21.4s, #1 + sri v20.4s, v21.4s, #31 +.inst 0xce5692e5 //sm3tt1a v5.4s, v23.4s, v22.4s[1] +.inst 0xce439ae6 //sm3tt2a v6.4s, v23.4s, v3.4s[1] +.inst 0xce5418b7 //sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s + shl v21.4s, v20.4s, #1 + sri v21.4s, v20.4s, #31 +.inst 0xce56a2e5 //sm3tt1a v5.4s, v23.4s, v22.4s[2] +.inst 0xce43aae6 //sm3tt2a v6.4s, v23.4s, v3.4s[2] +.inst 0xce5518b7 //sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s + shl v20.4s, v21.4s, #1 + sri v20.4s, v21.4s, #31 +.inst 0xce56b2e5 //sm3tt1a v5.4s, v23.4s, v22.4s[3] +.inst 0xce43bae6 //sm3tt2a v6.4s, v23.4s, v3.4s[3] + ext v20.16b, v17.16b, v17.16b, #4 + // s4 = w7 | w8 | w9 | w10 + ext v3.16b, v0.16b, v1.16b, #12 + // vtmp1 = w3 | w4 | w5 | w6 + ext v22.16b, v4.16b, v0.16b, #12 + // vtmp2 = w10 | w11 | w12 | w13 + ext v23.16b, v1.16b, v2.16b, #8 +.inst 0xce62c083 //sm3partw1 v3.4s, v4.4s, v2.4s +.inst 0xce76c6e3 //sm3partw2 v3.4s, v23.4s, v22.4s + eor v22.16b, v4.16b, v0.16b +.inst 0xce5418b7 //sm3ss1 v23.4s, 
v5.4s, v20.4s, v6.4s + shl v21.4s, v20.4s, #1 + sri v21.4s, v20.4s, #31 +.inst 0xce5686e5 //sm3tt1b v5.4s, v23.4s, v22.4s[0] +.inst 0xce448ee6 //sm3tt2b v6.4s, v23.4s, v4.4s[0] +.inst 0xce5518b7 //sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s + shl v20.4s, v21.4s, #1 + sri v20.4s, v21.4s, #31 +.inst 0xce5696e5 //sm3tt1b v5.4s, v23.4s, v22.4s[1] +.inst 0xce449ee6 //sm3tt2b v6.4s, v23.4s, v4.4s[1] +.inst 0xce5418b7 //sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s + shl v21.4s, v20.4s, #1 + sri v21.4s, v20.4s, #31 +.inst 0xce56a6e5 //sm3tt1b v5.4s, v23.4s, v22.4s[2] +.inst 0xce44aee6 //sm3tt2b v6.4s, v23.4s, v4.4s[2] +.inst 0xce5518b7 //sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s + shl v20.4s, v21.4s, #1 + sri v20.4s, v21.4s, #31 +.inst 0xce56b6e5 //sm3tt1b v5.4s, v23.4s, v22.4s[3] +.inst 0xce44bee6 //sm3tt2b v6.4s, v23.4s, v4.4s[3] + // s4 = w7 | w8 | w9 | w10 + ext v4.16b, v1.16b, v2.16b, #12 + // vtmp1 = w3 | w4 | w5 | w6 + ext v22.16b, v0.16b, v1.16b, #12 + // vtmp2 = w10 | w11 | w12 | w13 + ext v23.16b, v2.16b, v3.16b, #8 +.inst 0xce63c004 //sm3partw1 v4.4s, v0.4s, v3.4s +.inst 0xce76c6e4 //sm3partw2 v4.4s, v23.4s, v22.4s + eor v22.16b, v0.16b, v1.16b +.inst 0xce5418b7 //sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s + shl v21.4s, v20.4s, #1 + sri v21.4s, v20.4s, #31 +.inst 0xce5686e5 //sm3tt1b v5.4s, v23.4s, v22.4s[0] +.inst 0xce408ee6 //sm3tt2b v6.4s, v23.4s, v0.4s[0] +.inst 0xce5518b7 //sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s + shl v20.4s, v21.4s, #1 + sri v20.4s, v21.4s, #31 +.inst 0xce5696e5 //sm3tt1b v5.4s, v23.4s, v22.4s[1] +.inst 0xce409ee6 //sm3tt2b v6.4s, v23.4s, v0.4s[1] +.inst 0xce5418b7 //sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s + shl v21.4s, v20.4s, #1 + sri v21.4s, v20.4s, #31 +.inst 0xce56a6e5 //sm3tt1b v5.4s, v23.4s, v22.4s[2] +.inst 0xce40aee6 //sm3tt2b v6.4s, v23.4s, v0.4s[2] +.inst 0xce5518b7 //sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s + shl v20.4s, v21.4s, #1 + sri v20.4s, v21.4s, #31 +.inst 0xce56b6e5 //sm3tt1b v5.4s, v23.4s, v22.4s[3] +.inst 0xce40bee6 //sm3tt2b v6.4s, v23.4s, v0.4s[3] + // s4 = w7 | w8 | w9 | w10 + ext v0.16b, v2.16b, v3.16b, #12 + // vtmp1 = w3 | w4 | w5 | w6 + ext v22.16b, v1.16b, v2.16b, #12 + // vtmp2 = w10 | w11 | w12 | w13 + ext v23.16b, v3.16b, v4.16b, #8 +.inst 0xce64c020 //sm3partw1 v0.4s, v1.4s, v4.4s +.inst 0xce76c6e0 //sm3partw2 v0.4s, v23.4s, v22.4s + eor v22.16b, v1.16b, v2.16b +.inst 0xce5418b7 //sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s + shl v21.4s, v20.4s, #1 + sri v21.4s, v20.4s, #31 +.inst 0xce5686e5 //sm3tt1b v5.4s, v23.4s, v22.4s[0] +.inst 0xce418ee6 //sm3tt2b v6.4s, v23.4s, v1.4s[0] +.inst 0xce5518b7 //sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s + shl v20.4s, v21.4s, #1 + sri v20.4s, v21.4s, #31 +.inst 0xce5696e5 //sm3tt1b v5.4s, v23.4s, v22.4s[1] +.inst 0xce419ee6 //sm3tt2b v6.4s, v23.4s, v1.4s[1] +.inst 0xce5418b7 //sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s + shl v21.4s, v20.4s, #1 + sri v21.4s, v20.4s, #31 +.inst 0xce56a6e5 //sm3tt1b v5.4s, v23.4s, v22.4s[2] +.inst 0xce41aee6 //sm3tt2b v6.4s, v23.4s, v1.4s[2] +.inst 0xce5518b7 //sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s + shl v20.4s, v21.4s, #1 + sri v20.4s, v21.4s, #31 +.inst 0xce56b6e5 //sm3tt1b v5.4s, v23.4s, v22.4s[3] +.inst 0xce41bee6 //sm3tt2b v6.4s, v23.4s, v1.4s[3] + // s4 = w7 | w8 | w9 | w10 + ext v1.16b, v3.16b, v4.16b, #12 + // vtmp1 = w3 | w4 | w5 | w6 + ext v22.16b, v2.16b, v3.16b, #12 + // vtmp2 = w10 | w11 | w12 | w13 + ext v23.16b, v4.16b, v0.16b, #8 +.inst 0xce60c041 //sm3partw1 v1.4s, v2.4s, v0.4s +.inst 0xce76c6e1 //sm3partw2 v1.4s, v23.4s, v22.4s + eor v22.16b, v2.16b, v3.16b +.inst 0xce5418b7 //sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s 
+ shl v21.4s, v20.4s, #1 + sri v21.4s, v20.4s, #31 +.inst 0xce5686e5 //sm3tt1b v5.4s, v23.4s, v22.4s[0] +.inst 0xce428ee6 //sm3tt2b v6.4s, v23.4s, v2.4s[0] +.inst 0xce5518b7 //sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s + shl v20.4s, v21.4s, #1 + sri v20.4s, v21.4s, #31 +.inst 0xce5696e5 //sm3tt1b v5.4s, v23.4s, v22.4s[1] +.inst 0xce429ee6 //sm3tt2b v6.4s, v23.4s, v2.4s[1] +.inst 0xce5418b7 //sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s + shl v21.4s, v20.4s, #1 + sri v21.4s, v20.4s, #31 +.inst 0xce56a6e5 //sm3tt1b v5.4s, v23.4s, v22.4s[2] +.inst 0xce42aee6 //sm3tt2b v6.4s, v23.4s, v2.4s[2] +.inst 0xce5518b7 //sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s + shl v20.4s, v21.4s, #1 + sri v20.4s, v21.4s, #31 +.inst 0xce56b6e5 //sm3tt1b v5.4s, v23.4s, v22.4s[3] +.inst 0xce42bee6 //sm3tt2b v6.4s, v23.4s, v2.4s[3] + // s4 = w7 | w8 | w9 | w10 + ext v2.16b, v4.16b, v0.16b, #12 + // vtmp1 = w3 | w4 | w5 | w6 + ext v22.16b, v3.16b, v4.16b, #12 + // vtmp2 = w10 | w11 | w12 | w13 + ext v23.16b, v0.16b, v1.16b, #8 +.inst 0xce61c062 //sm3partw1 v2.4s, v3.4s, v1.4s +.inst 0xce76c6e2 //sm3partw2 v2.4s, v23.4s, v22.4s + eor v22.16b, v3.16b, v4.16b +.inst 0xce5418b7 //sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s + shl v21.4s, v20.4s, #1 + sri v21.4s, v20.4s, #31 +.inst 0xce5686e5 //sm3tt1b v5.4s, v23.4s, v22.4s[0] +.inst 0xce438ee6 //sm3tt2b v6.4s, v23.4s, v3.4s[0] +.inst 0xce5518b7 //sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s + shl v20.4s, v21.4s, #1 + sri v20.4s, v21.4s, #31 +.inst 0xce5696e5 //sm3tt1b v5.4s, v23.4s, v22.4s[1] +.inst 0xce439ee6 //sm3tt2b v6.4s, v23.4s, v3.4s[1] +.inst 0xce5418b7 //sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s + shl v21.4s, v20.4s, #1 + sri v21.4s, v20.4s, #31 +.inst 0xce56a6e5 //sm3tt1b v5.4s, v23.4s, v22.4s[2] +.inst 0xce43aee6 //sm3tt2b v6.4s, v23.4s, v3.4s[2] +.inst 0xce5518b7 //sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s + shl v20.4s, v21.4s, #1 + sri v20.4s, v21.4s, #31 +.inst 0xce56b6e5 //sm3tt1b v5.4s, v23.4s, v22.4s[3] +.inst 0xce43bee6 //sm3tt2b v6.4s, v23.4s, v3.4s[3] + // s4 = w7 | w8 | w9 | w10 + ext v3.16b, v0.16b, v1.16b, #12 + // vtmp1 = w3 | w4 | w5 | w6 + ext v22.16b, v4.16b, v0.16b, #12 + // vtmp2 = w10 | w11 | w12 | w13 + ext v23.16b, v1.16b, v2.16b, #8 +.inst 0xce62c083 //sm3partw1 v3.4s, v4.4s, v2.4s +.inst 0xce76c6e3 //sm3partw2 v3.4s, v23.4s, v22.4s + eor v22.16b, v4.16b, v0.16b +.inst 0xce5418b7 //sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s + shl v21.4s, v20.4s, #1 + sri v21.4s, v20.4s, #31 +.inst 0xce5686e5 //sm3tt1b v5.4s, v23.4s, v22.4s[0] +.inst 0xce448ee6 //sm3tt2b v6.4s, v23.4s, v4.4s[0] +.inst 0xce5518b7 //sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s + shl v20.4s, v21.4s, #1 + sri v20.4s, v21.4s, #31 +.inst 0xce5696e5 //sm3tt1b v5.4s, v23.4s, v22.4s[1] +.inst 0xce449ee6 //sm3tt2b v6.4s, v23.4s, v4.4s[1] +.inst 0xce5418b7 //sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s + shl v21.4s, v20.4s, #1 + sri v21.4s, v20.4s, #31 +.inst 0xce56a6e5 //sm3tt1b v5.4s, v23.4s, v22.4s[2] +.inst 0xce44aee6 //sm3tt2b v6.4s, v23.4s, v4.4s[2] +.inst 0xce5518b7 //sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s + shl v20.4s, v21.4s, #1 + sri v20.4s, v21.4s, #31 +.inst 0xce56b6e5 //sm3tt1b v5.4s, v23.4s, v22.4s[3] +.inst 0xce44bee6 //sm3tt2b v6.4s, v23.4s, v4.4s[3] + // s4 = w7 | w8 | w9 | w10 + ext v4.16b, v1.16b, v2.16b, #12 + // vtmp1 = w3 | w4 | w5 | w6 + ext v22.16b, v0.16b, v1.16b, #12 + // vtmp2 = w10 | w11 | w12 | w13 + ext v23.16b, v2.16b, v3.16b, #8 +.inst 0xce63c004 //sm3partw1 v4.4s, v0.4s, v3.4s +.inst 0xce76c6e4 //sm3partw2 v4.4s, v23.4s, v22.4s + eor v22.16b, v0.16b, v1.16b +.inst 0xce5418b7 //sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s + shl v21.4s, 
v20.4s, #1 + sri v21.4s, v20.4s, #31 +.inst 0xce5686e5 //sm3tt1b v5.4s, v23.4s, v22.4s[0] +.inst 0xce408ee6 //sm3tt2b v6.4s, v23.4s, v0.4s[0] +.inst 0xce5518b7 //sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s + shl v20.4s, v21.4s, #1 + sri v20.4s, v21.4s, #31 +.inst 0xce5696e5 //sm3tt1b v5.4s, v23.4s, v22.4s[1] +.inst 0xce409ee6 //sm3tt2b v6.4s, v23.4s, v0.4s[1] +.inst 0xce5418b7 //sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s + shl v21.4s, v20.4s, #1 + sri v21.4s, v20.4s, #31 +.inst 0xce56a6e5 //sm3tt1b v5.4s, v23.4s, v22.4s[2] +.inst 0xce40aee6 //sm3tt2b v6.4s, v23.4s, v0.4s[2] +.inst 0xce5518b7 //sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s + shl v20.4s, v21.4s, #1 + sri v20.4s, v21.4s, #31 +.inst 0xce56b6e5 //sm3tt1b v5.4s, v23.4s, v22.4s[3] +.inst 0xce40bee6 //sm3tt2b v6.4s, v23.4s, v0.4s[3] + // s4 = w7 | w8 | w9 | w10 + ext v0.16b, v2.16b, v3.16b, #12 + // vtmp1 = w3 | w4 | w5 | w6 + ext v22.16b, v1.16b, v2.16b, #12 + // vtmp2 = w10 | w11 | w12 | w13 + ext v23.16b, v3.16b, v4.16b, #8 +.inst 0xce64c020 //sm3partw1 v0.4s, v1.4s, v4.4s +.inst 0xce76c6e0 //sm3partw2 v0.4s, v23.4s, v22.4s + eor v22.16b, v1.16b, v2.16b +.inst 0xce5418b7 //sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s + shl v21.4s, v20.4s, #1 + sri v21.4s, v20.4s, #31 +.inst 0xce5686e5 //sm3tt1b v5.4s, v23.4s, v22.4s[0] +.inst 0xce418ee6 //sm3tt2b v6.4s, v23.4s, v1.4s[0] +.inst 0xce5518b7 //sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s + shl v20.4s, v21.4s, #1 + sri v20.4s, v21.4s, #31 +.inst 0xce5696e5 //sm3tt1b v5.4s, v23.4s, v22.4s[1] +.inst 0xce419ee6 //sm3tt2b v6.4s, v23.4s, v1.4s[1] +.inst 0xce5418b7 //sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s + shl v21.4s, v20.4s, #1 + sri v21.4s, v20.4s, #31 +.inst 0xce56a6e5 //sm3tt1b v5.4s, v23.4s, v22.4s[2] +.inst 0xce41aee6 //sm3tt2b v6.4s, v23.4s, v1.4s[2] +.inst 0xce5518b7 //sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s + shl v20.4s, v21.4s, #1 + sri v20.4s, v21.4s, #31 +.inst 0xce56b6e5 //sm3tt1b v5.4s, v23.4s, v22.4s[3] +.inst 0xce41bee6 //sm3tt2b v6.4s, v23.4s, v1.4s[3] + // s4 = w7 | w8 | w9 | w10 + ext v1.16b, v3.16b, v4.16b, #12 + // vtmp1 = w3 | w4 | w5 | w6 + ext v22.16b, v2.16b, v3.16b, #12 + // vtmp2 = w10 | w11 | w12 | w13 + ext v23.16b, v4.16b, v0.16b, #8 +.inst 0xce60c041 //sm3partw1 v1.4s, v2.4s, v0.4s +.inst 0xce76c6e1 //sm3partw2 v1.4s, v23.4s, v22.4s + eor v22.16b, v2.16b, v3.16b +.inst 0xce5418b7 //sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s + shl v21.4s, v20.4s, #1 + sri v21.4s, v20.4s, #31 +.inst 0xce5686e5 //sm3tt1b v5.4s, v23.4s, v22.4s[0] +.inst 0xce428ee6 //sm3tt2b v6.4s, v23.4s, v2.4s[0] +.inst 0xce5518b7 //sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s + shl v20.4s, v21.4s, #1 + sri v20.4s, v21.4s, #31 +.inst 0xce5696e5 //sm3tt1b v5.4s, v23.4s, v22.4s[1] +.inst 0xce429ee6 //sm3tt2b v6.4s, v23.4s, v2.4s[1] +.inst 0xce5418b7 //sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s + shl v21.4s, v20.4s, #1 + sri v21.4s, v20.4s, #31 +.inst 0xce56a6e5 //sm3tt1b v5.4s, v23.4s, v22.4s[2] +.inst 0xce42aee6 //sm3tt2b v6.4s, v23.4s, v2.4s[2] +.inst 0xce5518b7 //sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s + shl v20.4s, v21.4s, #1 + sri v20.4s, v21.4s, #31 +.inst 0xce56b6e5 //sm3tt1b v5.4s, v23.4s, v22.4s[3] +.inst 0xce42bee6 //sm3tt2b v6.4s, v23.4s, v2.4s[3] + eor v22.16b, v3.16b, v4.16b +.inst 0xce5418b7 //sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s + shl v21.4s, v20.4s, #1 + sri v21.4s, v20.4s, #31 +.inst 0xce5686e5 //sm3tt1b v5.4s, v23.4s, v22.4s[0] +.inst 0xce438ee6 //sm3tt2b v6.4s, v23.4s, v3.4s[0] +.inst 0xce5518b7 //sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s + shl v20.4s, v21.4s, #1 + sri v20.4s, v21.4s, #31 +.inst 0xce5696e5 //sm3tt1b v5.4s, v23.4s, v22.4s[1] +.inst 0xce439ee6 
//sm3tt2b v6.4s, v23.4s, v3.4s[1] +.inst 0xce5418b7 //sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s + shl v21.4s, v20.4s, #1 + sri v21.4s, v20.4s, #31 +.inst 0xce56a6e5 //sm3tt1b v5.4s, v23.4s, v22.4s[2] +.inst 0xce43aee6 //sm3tt2b v6.4s, v23.4s, v3.4s[2] +.inst 0xce5518b7 //sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s + shl v20.4s, v21.4s, #1 + sri v20.4s, v21.4s, #31 +.inst 0xce56b6e5 //sm3tt1b v5.4s, v23.4s, v22.4s[3] +.inst 0xce43bee6 //sm3tt2b v6.4s, v23.4s, v3.4s[3] + eor v22.16b, v4.16b, v0.16b +.inst 0xce5418b7 //sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s + shl v21.4s, v20.4s, #1 + sri v21.4s, v20.4s, #31 +.inst 0xce5686e5 //sm3tt1b v5.4s, v23.4s, v22.4s[0] +.inst 0xce448ee6 //sm3tt2b v6.4s, v23.4s, v4.4s[0] +.inst 0xce5518b7 //sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s + shl v20.4s, v21.4s, #1 + sri v20.4s, v21.4s, #31 +.inst 0xce5696e5 //sm3tt1b v5.4s, v23.4s, v22.4s[1] +.inst 0xce449ee6 //sm3tt2b v6.4s, v23.4s, v4.4s[1] +.inst 0xce5418b7 //sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s + shl v21.4s, v20.4s, #1 + sri v21.4s, v20.4s, #31 +.inst 0xce56a6e5 //sm3tt1b v5.4s, v23.4s, v22.4s[2] +.inst 0xce44aee6 //sm3tt2b v6.4s, v23.4s, v4.4s[2] +.inst 0xce5518b7 //sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s + shl v20.4s, v21.4s, #1 + sri v20.4s, v21.4s, #31 +.inst 0xce56b6e5 //sm3tt1b v5.4s, v23.4s, v22.4s[3] +.inst 0xce44bee6 //sm3tt2b v6.4s, v23.4s, v4.4s[3] + eor v22.16b, v0.16b, v1.16b +.inst 0xce5418b7 //sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s + shl v21.4s, v20.4s, #1 + sri v21.4s, v20.4s, #31 +.inst 0xce5686e5 //sm3tt1b v5.4s, v23.4s, v22.4s[0] +.inst 0xce408ee6 //sm3tt2b v6.4s, v23.4s, v0.4s[0] +.inst 0xce5518b7 //sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s + shl v20.4s, v21.4s, #1 + sri v20.4s, v21.4s, #31 +.inst 0xce5696e5 //sm3tt1b v5.4s, v23.4s, v22.4s[1] +.inst 0xce409ee6 //sm3tt2b v6.4s, v23.4s, v0.4s[1] +.inst 0xce5418b7 //sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s + shl v21.4s, v20.4s, #1 + sri v21.4s, v20.4s, #31 +.inst 0xce56a6e5 //sm3tt1b v5.4s, v23.4s, v22.4s[2] +.inst 0xce40aee6 //sm3tt2b v6.4s, v23.4s, v0.4s[2] +.inst 0xce5518b7 //sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s + shl v20.4s, v21.4s, #1 + sri v20.4s, v21.4s, #31 +.inst 0xce56b6e5 //sm3tt1b v5.4s, v23.4s, v22.4s[3] +.inst 0xce40bee6 //sm3tt2b v6.4s, v23.4s, v0.4s[3] + eor v5.16b, v5.16b, v18.16b + eor v6.16b, v6.16b, v19.16b + + // any remained blocks? 
+ cbnz w2, .Loop + + // save state + rev64 v5.4s, v5.4s + rev64 v6.4s, v6.4s + ext v5.16b, v5.16b, v5.16b, #8 + ext v6.16b, v6.16b, v6.16b, #8 + st1 {v5.4s,v6.4s}, [x0] + ret +.size ossl_hwsm3_block_data_order,.-ossl_hwsm3_block_data_order + +.align 3 +.Tj: +.word 0x79cc4519, 0x9d8a7a87 diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/aes/aes-x86_64.s b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/aes/aes-x86_64.s index 7d2428b971ff..eef97c628c3f 100644 --- a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/aes/aes-x86_64.s +++ b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/aes/aes-x86_64.s @@ -1870,6 +1870,7 @@ AES_cbc_encrypt: .byte 0xf3,0xc3 .cfi_endproc .size AES_cbc_encrypt,.-AES_cbc_encrypt +.section .rodata .align 64 .LAES_Te: .long 0xa56363c6,0xa56363c6 @@ -2656,6 +2657,7 @@ AES_cbc_encrypt: .long 0x1b1b1b1b, 0x1b1b1b1b, 0, 0 .byte 65,69,83,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 64 +.previous .section ".note.gnu.property", "a" .p2align 3 .long 1f - 0f diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/aes/aesni-sha1-x86_64.s b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/aes/aesni-sha1-x86_64.s index 68af8c69a684..6e4c90e93702 100644 --- a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/aes/aesni-sha1-x86_64.s +++ b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/aes/aesni-sha1-x86_64.s @@ -2718,6 +2718,7 @@ aesni_cbc_sha1_enc_avx: .byte 0xf3,0xc3 .cfi_endproc .size aesni_cbc_sha1_enc_avx,.-aesni_cbc_sha1_enc_avx +.section .rodata .align 64 K_XX_XX: .long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 @@ -2729,6 +2730,7 @@ K_XX_XX: .byte 65,69,83,78,73,45,67,66,67,43,83,72,65,49,32,115,116,105,116,99,104,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 64 +.previous .type aesni_cbc_sha1_enc_shaext,@function .align 32 aesni_cbc_sha1_enc_shaext: diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/aes/aesni-sha256-x86_64.s b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/aes/aesni-sha256-x86_64.s index 0e022a30c0de..77187bc39262 100644 --- a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/aes/aesni-sha256-x86_64.s +++ b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/aes/aesni-sha256-x86_64.s @@ -34,6 +34,7 @@ aesni_cbc_sha256_enc: .cfi_endproc .size aesni_cbc_sha256_enc,.-aesni_cbc_sha256_enc +.section .rodata .align 64 .type K256,@object K256: @@ -76,6 +77,7 @@ K256: .long 0,0,0,0, 0,0,0,0 .byte 65,69,83,78,73,45,67,66,67,43,83,72,65,50,53,54,32,115,116,105,116,99,104,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 64 +.previous .type aesni_cbc_sha256_enc_xop,@function .align 64 aesni_cbc_sha256_enc_xop: diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/aes/aesni-x86_64.s b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/aes/aesni-x86_64.s index aa7585f179a8..89774597b341 100644 --- a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/aes/aesni-x86_64.s +++ b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/aes/aesni-x86_64.s @@ -4461,6 +4461,7 @@ __aesni_set_encrypt_key: .cfi_endproc .size aesni_set_encrypt_key,.-aesni_set_encrypt_key .size 
__aesni_set_encrypt_key,.-__aesni_set_encrypt_key +.section .rodata .align 64 .Lbswap_mask: .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 @@ -4483,6 +4484,7 @@ __aesni_set_encrypt_key: .byte 65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69,83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 64 +.previous .section ".note.gnu.property", "a" .p2align 3 .long 1f - 0f diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/aes/bsaes-x86_64.s b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/aes/bsaes-x86_64.s index 5abda703024f..75766d1e32b3 100644 --- a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/aes/bsaes-x86_64.s +++ b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/aes/bsaes-x86_64.s @@ -1571,6 +1571,7 @@ ossl_bsaes_ctr32_encrypt_blocks: .align 16 ossl_bsaes_xts_encrypt: .cfi_startproc +.byte 243,15,30,250 movq %rsp,%rax .Lxts_enc_prologue: pushq %rbp @@ -2046,6 +2047,7 @@ ossl_bsaes_xts_encrypt: .align 16 ossl_bsaes_xts_decrypt: .cfi_startproc +.byte 243,15,30,250 movq %rsp,%rax .Lxts_dec_prologue: pushq %rbp @@ -2541,6 +2543,7 @@ ossl_bsaes_xts_decrypt: .cfi_endproc .size ossl_bsaes_xts_decrypt,.-ossl_bsaes_xts_decrypt .type _bsaes_const,@object +.section .rodata .align 64 _bsaes_const: .LM0ISR: @@ -2592,9 +2595,9 @@ _bsaes_const: .quad 0x02060a0e03070b0f, 0x0004080c0105090d .L63: .quad 0x6363636363636363, 0x6363636363636363 -.byte 66,105,116,45,115,108,105,99,101,100,32,65,69,83,32,102,111,114,32,120,56,54,95,54,52,47,83,83,83,69,51,44,32,69,109,105,108,105,97,32,75,195,164,115,112,101,114,44,32,80,101,116,101,114,32,83,99,104,119,97,98,101,44,32,65,110,100,121,32,80,111,108,121,97,107,111,118,0 .align 64 .size _bsaes_const,.-_bsaes_const +.byte 66,105,116,45,115,108,105,99,101,100,32,65,69,83,32,102,111,114,32,120,56,54,95,54,52,47,83,83,83,69,51,44,32,69,109,105,108,105,97,32,75,195,164,115,112,101,114,44,32,80,101,116,101,114,32,83,99,104,119,97,98,101,44,32,65,110,100,121,32,80,111,108,121,97,107,111,118,0 .section ".note.gnu.property", "a" .p2align 3 .long 1f - 0f diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/aes/vpaes-x86_64.s b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/aes/vpaes-x86_64.s index 4bd2e683b9f6..5f4159c858bd 100644 --- a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/aes/vpaes-x86_64.s +++ b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/aes/vpaes-x86_64.s @@ -758,6 +758,7 @@ _vpaes_preheat: .type _vpaes_consts,@object +.section .rodata .align 64 _vpaes_consts: .Lk_inv: @@ -853,9 +854,9 @@ _vpaes_consts: .Lk_dsbo: .quad 0x1387EA537EF94000, 0xC7AA6DB9D4943E2D .quad 0x12D7560F93441D00, 0xCA4B8159D8C58E9C -.byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,120,56,54,95,54,52,47,83,83,83,69,51,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0 .align 64 .size _vpaes_consts,.-_vpaes_consts +.byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,120,56,54,95,54,52,47,83,83,83,69,51,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0 .section ".note.gnu.property", "a" .p2align 3 .long 1f - 0f diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/bn/rsaz-2k-avx512.s 
b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/bn/rsaz-2k-avx512.s new file mode 100644 index 000000000000..1e779c624652 --- /dev/null +++ b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/bn/rsaz-2k-avx512.s @@ -0,0 +1,916 @@ + +.globl ossl_rsaz_avx512ifma_eligible +.type ossl_rsaz_avx512ifma_eligible,@function +.align 32 +ossl_rsaz_avx512ifma_eligible: + movl OPENSSL_ia32cap_P+8(%rip),%ecx + xorl %eax,%eax + andl $2149777408,%ecx + cmpl $2149777408,%ecx + cmovel %ecx,%eax + .byte 0xf3,0xc3 +.size ossl_rsaz_avx512ifma_eligible, .-ossl_rsaz_avx512ifma_eligible +.text + +.globl ossl_rsaz_amm52x20_x1_ifma256 +.type ossl_rsaz_amm52x20_x1_ifma256,@function +.align 32 +ossl_rsaz_amm52x20_x1_ifma256: +.cfi_startproc +.byte 243,15,30,250 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 +.Lossl_rsaz_amm52x20_x1_ifma256_body: + + + vpxord %ymm0,%ymm0,%ymm0 + vmovdqa64 %ymm0,%ymm3 + vmovdqa64 %ymm0,%ymm16 + vmovdqa64 %ymm0,%ymm17 + vmovdqa64 %ymm0,%ymm18 + vmovdqa64 %ymm0,%ymm19 + + xorl %r9d,%r9d + + movq %rdx,%r11 + movq $0xfffffffffffff,%rax + + + movl $5,%ebx + +.align 32 +.Lloop5: + movq 0(%r11),%r13 + + vpbroadcastq %r13,%ymm1 + movq 0(%rsi),%rdx + mulxq %r13,%r13,%r12 + addq %r13,%r9 + movq %r12,%r10 + adcq $0,%r10 + + movq %r8,%r13 + imulq %r9,%r13 + andq %rax,%r13 + + vpbroadcastq %r13,%ymm2 + movq 0(%rcx),%rdx + mulxq %r13,%r13,%r12 + addq %r13,%r9 + adcq %r12,%r10 + + shrq $52,%r9 + salq $12,%r10 + orq %r10,%r9 + + vpmadd52luq 0(%rsi),%ymm1,%ymm3 + vpmadd52luq 32(%rsi),%ymm1,%ymm16 + vpmadd52luq 64(%rsi),%ymm1,%ymm17 + vpmadd52luq 96(%rsi),%ymm1,%ymm18 + vpmadd52luq 128(%rsi),%ymm1,%ymm19 + + vpmadd52luq 0(%rcx),%ymm2,%ymm3 + vpmadd52luq 32(%rcx),%ymm2,%ymm16 + vpmadd52luq 64(%rcx),%ymm2,%ymm17 + vpmadd52luq 96(%rcx),%ymm2,%ymm18 + vpmadd52luq 128(%rcx),%ymm2,%ymm19 + + + valignq $1,%ymm3,%ymm16,%ymm3 + valignq $1,%ymm16,%ymm17,%ymm16 + valignq $1,%ymm17,%ymm18,%ymm17 + valignq $1,%ymm18,%ymm19,%ymm18 + valignq $1,%ymm19,%ymm0,%ymm19 + + vmovq %xmm3,%r13 + addq %r13,%r9 + + vpmadd52huq 0(%rsi),%ymm1,%ymm3 + vpmadd52huq 32(%rsi),%ymm1,%ymm16 + vpmadd52huq 64(%rsi),%ymm1,%ymm17 + vpmadd52huq 96(%rsi),%ymm1,%ymm18 + vpmadd52huq 128(%rsi),%ymm1,%ymm19 + + vpmadd52huq 0(%rcx),%ymm2,%ymm3 + vpmadd52huq 32(%rcx),%ymm2,%ymm16 + vpmadd52huq 64(%rcx),%ymm2,%ymm17 + vpmadd52huq 96(%rcx),%ymm2,%ymm18 + vpmadd52huq 128(%rcx),%ymm2,%ymm19 + movq 8(%r11),%r13 + + vpbroadcastq %r13,%ymm1 + movq 0(%rsi),%rdx + mulxq %r13,%r13,%r12 + addq %r13,%r9 + movq %r12,%r10 + adcq $0,%r10 + + movq %r8,%r13 + imulq %r9,%r13 + andq %rax,%r13 + + vpbroadcastq %r13,%ymm2 + movq 0(%rcx),%rdx + mulxq %r13,%r13,%r12 + addq %r13,%r9 + adcq %r12,%r10 + + shrq $52,%r9 + salq $12,%r10 + orq %r10,%r9 + + vpmadd52luq 0(%rsi),%ymm1,%ymm3 + vpmadd52luq 32(%rsi),%ymm1,%ymm16 + vpmadd52luq 64(%rsi),%ymm1,%ymm17 + vpmadd52luq 96(%rsi),%ymm1,%ymm18 + vpmadd52luq 128(%rsi),%ymm1,%ymm19 + + vpmadd52luq 0(%rcx),%ymm2,%ymm3 + vpmadd52luq 32(%rcx),%ymm2,%ymm16 + vpmadd52luq 64(%rcx),%ymm2,%ymm17 + vpmadd52luq 96(%rcx),%ymm2,%ymm18 + vpmadd52luq 128(%rcx),%ymm2,%ymm19 + + + valignq $1,%ymm3,%ymm16,%ymm3 + valignq $1,%ymm16,%ymm17,%ymm16 + valignq $1,%ymm17,%ymm18,%ymm17 + valignq $1,%ymm18,%ymm19,%ymm18 + valignq 
$1,%ymm19,%ymm0,%ymm19 + + vmovq %xmm3,%r13 + addq %r13,%r9 + + vpmadd52huq 0(%rsi),%ymm1,%ymm3 + vpmadd52huq 32(%rsi),%ymm1,%ymm16 + vpmadd52huq 64(%rsi),%ymm1,%ymm17 + vpmadd52huq 96(%rsi),%ymm1,%ymm18 + vpmadd52huq 128(%rsi),%ymm1,%ymm19 + + vpmadd52huq 0(%rcx),%ymm2,%ymm3 + vpmadd52huq 32(%rcx),%ymm2,%ymm16 + vpmadd52huq 64(%rcx),%ymm2,%ymm17 + vpmadd52huq 96(%rcx),%ymm2,%ymm18 + vpmadd52huq 128(%rcx),%ymm2,%ymm19 + movq 16(%r11),%r13 + + vpbroadcastq %r13,%ymm1 + movq 0(%rsi),%rdx + mulxq %r13,%r13,%r12 + addq %r13,%r9 + movq %r12,%r10 + adcq $0,%r10 + + movq %r8,%r13 + imulq %r9,%r13 + andq %rax,%r13 + + vpbroadcastq %r13,%ymm2 + movq 0(%rcx),%rdx + mulxq %r13,%r13,%r12 + addq %r13,%r9 + adcq %r12,%r10 + + shrq $52,%r9 + salq $12,%r10 + orq %r10,%r9 + + vpmadd52luq 0(%rsi),%ymm1,%ymm3 + vpmadd52luq 32(%rsi),%ymm1,%ymm16 + vpmadd52luq 64(%rsi),%ymm1,%ymm17 + vpmadd52luq 96(%rsi),%ymm1,%ymm18 + vpmadd52luq 128(%rsi),%ymm1,%ymm19 + + vpmadd52luq 0(%rcx),%ymm2,%ymm3 + vpmadd52luq 32(%rcx),%ymm2,%ymm16 + vpmadd52luq 64(%rcx),%ymm2,%ymm17 + vpmadd52luq 96(%rcx),%ymm2,%ymm18 + vpmadd52luq 128(%rcx),%ymm2,%ymm19 + + + valignq $1,%ymm3,%ymm16,%ymm3 + valignq $1,%ymm16,%ymm17,%ymm16 + valignq $1,%ymm17,%ymm18,%ymm17 + valignq $1,%ymm18,%ymm19,%ymm18 + valignq $1,%ymm19,%ymm0,%ymm19 + + vmovq %xmm3,%r13 + addq %r13,%r9 + + vpmadd52huq 0(%rsi),%ymm1,%ymm3 + vpmadd52huq 32(%rsi),%ymm1,%ymm16 + vpmadd52huq 64(%rsi),%ymm1,%ymm17 + vpmadd52huq 96(%rsi),%ymm1,%ymm18 + vpmadd52huq 128(%rsi),%ymm1,%ymm19 + + vpmadd52huq 0(%rcx),%ymm2,%ymm3 + vpmadd52huq 32(%rcx),%ymm2,%ymm16 + vpmadd52huq 64(%rcx),%ymm2,%ymm17 + vpmadd52huq 96(%rcx),%ymm2,%ymm18 + vpmadd52huq 128(%rcx),%ymm2,%ymm19 + movq 24(%r11),%r13 + + vpbroadcastq %r13,%ymm1 + movq 0(%rsi),%rdx + mulxq %r13,%r13,%r12 + addq %r13,%r9 + movq %r12,%r10 + adcq $0,%r10 + + movq %r8,%r13 + imulq %r9,%r13 + andq %rax,%r13 + + vpbroadcastq %r13,%ymm2 + movq 0(%rcx),%rdx + mulxq %r13,%r13,%r12 + addq %r13,%r9 + adcq %r12,%r10 + + shrq $52,%r9 + salq $12,%r10 + orq %r10,%r9 + + vpmadd52luq 0(%rsi),%ymm1,%ymm3 + vpmadd52luq 32(%rsi),%ymm1,%ymm16 + vpmadd52luq 64(%rsi),%ymm1,%ymm17 + vpmadd52luq 96(%rsi),%ymm1,%ymm18 + vpmadd52luq 128(%rsi),%ymm1,%ymm19 + + vpmadd52luq 0(%rcx),%ymm2,%ymm3 + vpmadd52luq 32(%rcx),%ymm2,%ymm16 + vpmadd52luq 64(%rcx),%ymm2,%ymm17 + vpmadd52luq 96(%rcx),%ymm2,%ymm18 + vpmadd52luq 128(%rcx),%ymm2,%ymm19 + + + valignq $1,%ymm3,%ymm16,%ymm3 + valignq $1,%ymm16,%ymm17,%ymm16 + valignq $1,%ymm17,%ymm18,%ymm17 + valignq $1,%ymm18,%ymm19,%ymm18 + valignq $1,%ymm19,%ymm0,%ymm19 + + vmovq %xmm3,%r13 + addq %r13,%r9 + + vpmadd52huq 0(%rsi),%ymm1,%ymm3 + vpmadd52huq 32(%rsi),%ymm1,%ymm16 + vpmadd52huq 64(%rsi),%ymm1,%ymm17 + vpmadd52huq 96(%rsi),%ymm1,%ymm18 + vpmadd52huq 128(%rsi),%ymm1,%ymm19 + + vpmadd52huq 0(%rcx),%ymm2,%ymm3 + vpmadd52huq 32(%rcx),%ymm2,%ymm16 + vpmadd52huq 64(%rcx),%ymm2,%ymm17 + vpmadd52huq 96(%rcx),%ymm2,%ymm18 + vpmadd52huq 128(%rcx),%ymm2,%ymm19 + leaq 32(%r11),%r11 + decl %ebx + jne .Lloop5 + + vpbroadcastq %r9,%ymm0 + vpblendd $3,%ymm0,%ymm3,%ymm3 + + + + vpsrlq $52,%ymm3,%ymm0 + vpsrlq $52,%ymm16,%ymm1 + vpsrlq $52,%ymm17,%ymm2 + vpsrlq $52,%ymm18,%ymm25 + vpsrlq $52,%ymm19,%ymm26 + + + valignq $3,%ymm25,%ymm26,%ymm26 + valignq $3,%ymm2,%ymm25,%ymm25 + valignq $3,%ymm1,%ymm2,%ymm2 + valignq $3,%ymm0,%ymm1,%ymm1 + valignq $3,.Lzeros(%rip),%ymm0,%ymm0 + + + vpandq .Lmask52x4(%rip),%ymm3,%ymm3 + vpandq .Lmask52x4(%rip),%ymm16,%ymm16 + vpandq .Lmask52x4(%rip),%ymm17,%ymm17 + vpandq .Lmask52x4(%rip),%ymm18,%ymm18 + 
vpandq .Lmask52x4(%rip),%ymm19,%ymm19 + + + vpaddq %ymm0,%ymm3,%ymm3 + vpaddq %ymm1,%ymm16,%ymm16 + vpaddq %ymm2,%ymm17,%ymm17 + vpaddq %ymm25,%ymm18,%ymm18 + vpaddq %ymm26,%ymm19,%ymm19 + + + + vpcmpuq $6,.Lmask52x4(%rip),%ymm3,%k1 + vpcmpuq $6,.Lmask52x4(%rip),%ymm16,%k2 + vpcmpuq $6,.Lmask52x4(%rip),%ymm17,%k3 + vpcmpuq $6,.Lmask52x4(%rip),%ymm18,%k4 + vpcmpuq $6,.Lmask52x4(%rip),%ymm19,%k5 + kmovb %k1,%r14d + kmovb %k2,%r13d + kmovb %k3,%r12d + kmovb %k4,%r11d + kmovb %k5,%r10d + + + vpcmpuq $0,.Lmask52x4(%rip),%ymm3,%k1 + vpcmpuq $0,.Lmask52x4(%rip),%ymm16,%k2 + vpcmpuq $0,.Lmask52x4(%rip),%ymm17,%k3 + vpcmpuq $0,.Lmask52x4(%rip),%ymm18,%k4 + vpcmpuq $0,.Lmask52x4(%rip),%ymm19,%k5 + kmovb %k1,%r9d + kmovb %k2,%r8d + kmovb %k3,%ebx + kmovb %k4,%ecx + kmovb %k5,%edx + + + + shlb $4,%r13b + orb %r13b,%r14b + shlb $4,%r11b + orb %r11b,%r12b + + addb %r14b,%r14b + adcb %r12b,%r12b + adcb %r10b,%r10b + + shlb $4,%r8b + orb %r8b,%r9b + shlb $4,%cl + orb %cl,%bl + + addb %r9b,%r14b + adcb %bl,%r12b + adcb %dl,%r10b + + xorb %r9b,%r14b + xorb %bl,%r12b + xorb %dl,%r10b + + kmovb %r14d,%k1 + shrb $4,%r14b + kmovb %r14d,%k2 + kmovb %r12d,%k3 + shrb $4,%r12b + kmovb %r12d,%k4 + kmovb %r10d,%k5 + + + vpsubq .Lmask52x4(%rip),%ymm3,%ymm3{%k1} + vpsubq .Lmask52x4(%rip),%ymm16,%ymm16{%k2} + vpsubq .Lmask52x4(%rip),%ymm17,%ymm17{%k3} + vpsubq .Lmask52x4(%rip),%ymm18,%ymm18{%k4} + vpsubq .Lmask52x4(%rip),%ymm19,%ymm19{%k5} + + vpandq .Lmask52x4(%rip),%ymm3,%ymm3 + vpandq .Lmask52x4(%rip),%ymm16,%ymm16 + vpandq .Lmask52x4(%rip),%ymm17,%ymm17 + vpandq .Lmask52x4(%rip),%ymm18,%ymm18 + vpandq .Lmask52x4(%rip),%ymm19,%ymm19 + + vmovdqu64 %ymm3,0(%rdi) + vmovdqu64 %ymm16,32(%rdi) + vmovdqu64 %ymm17,64(%rdi) + vmovdqu64 %ymm18,96(%rdi) + vmovdqu64 %ymm19,128(%rdi) + + vzeroupper + movq 0(%rsp),%r15 +.cfi_restore %r15 + movq 8(%rsp),%r14 +.cfi_restore %r14 + movq 16(%rsp),%r13 +.cfi_restore %r13 + movq 24(%rsp),%r12 +.cfi_restore %r12 + movq 32(%rsp),%rbp +.cfi_restore %rbp + movq 40(%rsp),%rbx +.cfi_restore %rbx + leaq 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 +.Lossl_rsaz_amm52x20_x1_ifma256_epilogue: + .byte 0xf3,0xc3 +.cfi_endproc +.size ossl_rsaz_amm52x20_x1_ifma256, .-ossl_rsaz_amm52x20_x1_ifma256 +.section .rodata +.align 32 +.Lmask52x4: +.quad 0xfffffffffffff +.quad 0xfffffffffffff +.quad 0xfffffffffffff +.quad 0xfffffffffffff +.text + +.globl ossl_rsaz_amm52x20_x2_ifma256 +.type ossl_rsaz_amm52x20_x2_ifma256,@function +.align 32 +ossl_rsaz_amm52x20_x2_ifma256: +.cfi_startproc +.byte 243,15,30,250 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 +.Lossl_rsaz_amm52x20_x2_ifma256_body: + + + vpxord %ymm0,%ymm0,%ymm0 + vmovdqa64 %ymm0,%ymm3 + vmovdqa64 %ymm0,%ymm16 + vmovdqa64 %ymm0,%ymm17 + vmovdqa64 %ymm0,%ymm18 + vmovdqa64 %ymm0,%ymm19 + vmovdqa64 %ymm0,%ymm4 + vmovdqa64 %ymm0,%ymm20 + vmovdqa64 %ymm0,%ymm21 + vmovdqa64 %ymm0,%ymm22 + vmovdqa64 %ymm0,%ymm23 + + xorl %r9d,%r9d + xorl %r15d,%r15d + + movq %rdx,%r11 + movq $0xfffffffffffff,%rax + + movl $20,%ebx + +.align 32 +.Lloop20: + movq 0(%r11),%r13 + + vpbroadcastq %r13,%ymm1 + movq 0(%rsi),%rdx + mulxq %r13,%r13,%r12 + addq %r13,%r9 + movq %r12,%r10 + adcq $0,%r10 + + movq (%r8),%r13 + imulq %r9,%r13 + andq %rax,%r13 + + vpbroadcastq 
%r13,%ymm2 + movq 0(%rcx),%rdx + mulxq %r13,%r13,%r12 + addq %r13,%r9 + adcq %r12,%r10 + + shrq $52,%r9 + salq $12,%r10 + orq %r10,%r9 + + vpmadd52luq 0(%rsi),%ymm1,%ymm3 + vpmadd52luq 32(%rsi),%ymm1,%ymm16 + vpmadd52luq 64(%rsi),%ymm1,%ymm17 + vpmadd52luq 96(%rsi),%ymm1,%ymm18 + vpmadd52luq 128(%rsi),%ymm1,%ymm19 + + vpmadd52luq 0(%rcx),%ymm2,%ymm3 + vpmadd52luq 32(%rcx),%ymm2,%ymm16 + vpmadd52luq 64(%rcx),%ymm2,%ymm17 + vpmadd52luq 96(%rcx),%ymm2,%ymm18 + vpmadd52luq 128(%rcx),%ymm2,%ymm19 + + + valignq $1,%ymm3,%ymm16,%ymm3 + valignq $1,%ymm16,%ymm17,%ymm16 + valignq $1,%ymm17,%ymm18,%ymm17 + valignq $1,%ymm18,%ymm19,%ymm18 + valignq $1,%ymm19,%ymm0,%ymm19 + + vmovq %xmm3,%r13 + addq %r13,%r9 + + vpmadd52huq 0(%rsi),%ymm1,%ymm3 + vpmadd52huq 32(%rsi),%ymm1,%ymm16 + vpmadd52huq 64(%rsi),%ymm1,%ymm17 + vpmadd52huq 96(%rsi),%ymm1,%ymm18 + vpmadd52huq 128(%rsi),%ymm1,%ymm19 + + vpmadd52huq 0(%rcx),%ymm2,%ymm3 + vpmadd52huq 32(%rcx),%ymm2,%ymm16 + vpmadd52huq 64(%rcx),%ymm2,%ymm17 + vpmadd52huq 96(%rcx),%ymm2,%ymm18 + vpmadd52huq 128(%rcx),%ymm2,%ymm19 + movq 160(%r11),%r13 + + vpbroadcastq %r13,%ymm1 + movq 160(%rsi),%rdx + mulxq %r13,%r13,%r12 + addq %r13,%r15 + movq %r12,%r10 + adcq $0,%r10 + + movq 8(%r8),%r13 + imulq %r15,%r13 + andq %rax,%r13 + + vpbroadcastq %r13,%ymm2 + movq 160(%rcx),%rdx + mulxq %r13,%r13,%r12 + addq %r13,%r15 + adcq %r12,%r10 + + shrq $52,%r15 + salq $12,%r10 + orq %r10,%r15 + + vpmadd52luq 160(%rsi),%ymm1,%ymm4 + vpmadd52luq 192(%rsi),%ymm1,%ymm20 + vpmadd52luq 224(%rsi),%ymm1,%ymm21 + vpmadd52luq 256(%rsi),%ymm1,%ymm22 + vpmadd52luq 288(%rsi),%ymm1,%ymm23 + + vpmadd52luq 160(%rcx),%ymm2,%ymm4 + vpmadd52luq 192(%rcx),%ymm2,%ymm20 + vpmadd52luq 224(%rcx),%ymm2,%ymm21 + vpmadd52luq 256(%rcx),%ymm2,%ymm22 + vpmadd52luq 288(%rcx),%ymm2,%ymm23 + + + valignq $1,%ymm4,%ymm20,%ymm4 + valignq $1,%ymm20,%ymm21,%ymm20 + valignq $1,%ymm21,%ymm22,%ymm21 + valignq $1,%ymm22,%ymm23,%ymm22 + valignq $1,%ymm23,%ymm0,%ymm23 + + vmovq %xmm4,%r13 + addq %r13,%r15 + + vpmadd52huq 160(%rsi),%ymm1,%ymm4 + vpmadd52huq 192(%rsi),%ymm1,%ymm20 + vpmadd52huq 224(%rsi),%ymm1,%ymm21 + vpmadd52huq 256(%rsi),%ymm1,%ymm22 + vpmadd52huq 288(%rsi),%ymm1,%ymm23 + + vpmadd52huq 160(%rcx),%ymm2,%ymm4 + vpmadd52huq 192(%rcx),%ymm2,%ymm20 + vpmadd52huq 224(%rcx),%ymm2,%ymm21 + vpmadd52huq 256(%rcx),%ymm2,%ymm22 + vpmadd52huq 288(%rcx),%ymm2,%ymm23 + leaq 8(%r11),%r11 + decl %ebx + jne .Lloop20 + + vpbroadcastq %r9,%ymm0 + vpblendd $3,%ymm0,%ymm3,%ymm3 + + + + vpsrlq $52,%ymm3,%ymm0 + vpsrlq $52,%ymm16,%ymm1 + vpsrlq $52,%ymm17,%ymm2 + vpsrlq $52,%ymm18,%ymm25 + vpsrlq $52,%ymm19,%ymm26 + + + valignq $3,%ymm25,%ymm26,%ymm26 + valignq $3,%ymm2,%ymm25,%ymm25 + valignq $3,%ymm1,%ymm2,%ymm2 + valignq $3,%ymm0,%ymm1,%ymm1 + valignq $3,.Lzeros(%rip),%ymm0,%ymm0 + + + vpandq .Lmask52x4(%rip),%ymm3,%ymm3 + vpandq .Lmask52x4(%rip),%ymm16,%ymm16 + vpandq .Lmask52x4(%rip),%ymm17,%ymm17 + vpandq .Lmask52x4(%rip),%ymm18,%ymm18 + vpandq .Lmask52x4(%rip),%ymm19,%ymm19 + + + vpaddq %ymm0,%ymm3,%ymm3 + vpaddq %ymm1,%ymm16,%ymm16 + vpaddq %ymm2,%ymm17,%ymm17 + vpaddq %ymm25,%ymm18,%ymm18 + vpaddq %ymm26,%ymm19,%ymm19 + + + + vpcmpuq $6,.Lmask52x4(%rip),%ymm3,%k1 + vpcmpuq $6,.Lmask52x4(%rip),%ymm16,%k2 + vpcmpuq $6,.Lmask52x4(%rip),%ymm17,%k3 + vpcmpuq $6,.Lmask52x4(%rip),%ymm18,%k4 + vpcmpuq $6,.Lmask52x4(%rip),%ymm19,%k5 + kmovb %k1,%r14d + kmovb %k2,%r13d + kmovb %k3,%r12d + kmovb %k4,%r11d + kmovb %k5,%r10d + + + vpcmpuq $0,.Lmask52x4(%rip),%ymm3,%k1 + vpcmpuq $0,.Lmask52x4(%rip),%ymm16,%k2 + vpcmpuq 
$0,.Lmask52x4(%rip),%ymm17,%k3 + vpcmpuq $0,.Lmask52x4(%rip),%ymm18,%k4 + vpcmpuq $0,.Lmask52x4(%rip),%ymm19,%k5 + kmovb %k1,%r9d + kmovb %k2,%r8d + kmovb %k3,%ebx + kmovb %k4,%ecx + kmovb %k5,%edx + + + + shlb $4,%r13b + orb %r13b,%r14b + shlb $4,%r11b + orb %r11b,%r12b + + addb %r14b,%r14b + adcb %r12b,%r12b + adcb %r10b,%r10b + + shlb $4,%r8b + orb %r8b,%r9b + shlb $4,%cl + orb %cl,%bl + + addb %r9b,%r14b + adcb %bl,%r12b + adcb %dl,%r10b + + xorb %r9b,%r14b + xorb %bl,%r12b + xorb %dl,%r10b + + kmovb %r14d,%k1 + shrb $4,%r14b + kmovb %r14d,%k2 + kmovb %r12d,%k3 + shrb $4,%r12b + kmovb %r12d,%k4 + kmovb %r10d,%k5 + + + vpsubq .Lmask52x4(%rip),%ymm3,%ymm3{%k1} + vpsubq .Lmask52x4(%rip),%ymm16,%ymm16{%k2} + vpsubq .Lmask52x4(%rip),%ymm17,%ymm17{%k3} + vpsubq .Lmask52x4(%rip),%ymm18,%ymm18{%k4} + vpsubq .Lmask52x4(%rip),%ymm19,%ymm19{%k5} + + vpandq .Lmask52x4(%rip),%ymm3,%ymm3 + vpandq .Lmask52x4(%rip),%ymm16,%ymm16 + vpandq .Lmask52x4(%rip),%ymm17,%ymm17 + vpandq .Lmask52x4(%rip),%ymm18,%ymm18 + vpandq .Lmask52x4(%rip),%ymm19,%ymm19 + + vpbroadcastq %r15,%ymm0 + vpblendd $3,%ymm0,%ymm4,%ymm4 + + + + vpsrlq $52,%ymm4,%ymm0 + vpsrlq $52,%ymm20,%ymm1 + vpsrlq $52,%ymm21,%ymm2 + vpsrlq $52,%ymm22,%ymm25 + vpsrlq $52,%ymm23,%ymm26 + + + valignq $3,%ymm25,%ymm26,%ymm26 + valignq $3,%ymm2,%ymm25,%ymm25 + valignq $3,%ymm1,%ymm2,%ymm2 + valignq $3,%ymm0,%ymm1,%ymm1 + valignq $3,.Lzeros(%rip),%ymm0,%ymm0 + + + vpandq .Lmask52x4(%rip),%ymm4,%ymm4 + vpandq .Lmask52x4(%rip),%ymm20,%ymm20 + vpandq .Lmask52x4(%rip),%ymm21,%ymm21 + vpandq .Lmask52x4(%rip),%ymm22,%ymm22 + vpandq .Lmask52x4(%rip),%ymm23,%ymm23 + + + vpaddq %ymm0,%ymm4,%ymm4 + vpaddq %ymm1,%ymm20,%ymm20 + vpaddq %ymm2,%ymm21,%ymm21 + vpaddq %ymm25,%ymm22,%ymm22 + vpaddq %ymm26,%ymm23,%ymm23 + + + + vpcmpuq $6,.Lmask52x4(%rip),%ymm4,%k1 + vpcmpuq $6,.Lmask52x4(%rip),%ymm20,%k2 + vpcmpuq $6,.Lmask52x4(%rip),%ymm21,%k3 + vpcmpuq $6,.Lmask52x4(%rip),%ymm22,%k4 + vpcmpuq $6,.Lmask52x4(%rip),%ymm23,%k5 + kmovb %k1,%r14d + kmovb %k2,%r13d + kmovb %k3,%r12d + kmovb %k4,%r11d + kmovb %k5,%r10d + + + vpcmpuq $0,.Lmask52x4(%rip),%ymm4,%k1 + vpcmpuq $0,.Lmask52x4(%rip),%ymm20,%k2 + vpcmpuq $0,.Lmask52x4(%rip),%ymm21,%k3 + vpcmpuq $0,.Lmask52x4(%rip),%ymm22,%k4 + vpcmpuq $0,.Lmask52x4(%rip),%ymm23,%k5 + kmovb %k1,%r9d + kmovb %k2,%r8d + kmovb %k3,%ebx + kmovb %k4,%ecx + kmovb %k5,%edx + + + + shlb $4,%r13b + orb %r13b,%r14b + shlb $4,%r11b + orb %r11b,%r12b + + addb %r14b,%r14b + adcb %r12b,%r12b + adcb %r10b,%r10b + + shlb $4,%r8b + orb %r8b,%r9b + shlb $4,%cl + orb %cl,%bl + + addb %r9b,%r14b + adcb %bl,%r12b + adcb %dl,%r10b + + xorb %r9b,%r14b + xorb %bl,%r12b + xorb %dl,%r10b + + kmovb %r14d,%k1 + shrb $4,%r14b + kmovb %r14d,%k2 + kmovb %r12d,%k3 + shrb $4,%r12b + kmovb %r12d,%k4 + kmovb %r10d,%k5 + + + vpsubq .Lmask52x4(%rip),%ymm4,%ymm4{%k1} + vpsubq .Lmask52x4(%rip),%ymm20,%ymm20{%k2} + vpsubq .Lmask52x4(%rip),%ymm21,%ymm21{%k3} + vpsubq .Lmask52x4(%rip),%ymm22,%ymm22{%k4} + vpsubq .Lmask52x4(%rip),%ymm23,%ymm23{%k5} + + vpandq .Lmask52x4(%rip),%ymm4,%ymm4 + vpandq .Lmask52x4(%rip),%ymm20,%ymm20 + vpandq .Lmask52x4(%rip),%ymm21,%ymm21 + vpandq .Lmask52x4(%rip),%ymm22,%ymm22 + vpandq .Lmask52x4(%rip),%ymm23,%ymm23 + + vmovdqu64 %ymm3,0(%rdi) + vmovdqu64 %ymm16,32(%rdi) + vmovdqu64 %ymm17,64(%rdi) + vmovdqu64 %ymm18,96(%rdi) + vmovdqu64 %ymm19,128(%rdi) + + vmovdqu64 %ymm4,160(%rdi) + vmovdqu64 %ymm20,192(%rdi) + vmovdqu64 %ymm21,224(%rdi) + vmovdqu64 %ymm22,256(%rdi) + vmovdqu64 %ymm23,288(%rdi) + + vzeroupper + movq 0(%rsp),%r15 +.cfi_restore 
%r15 + movq 8(%rsp),%r14 +.cfi_restore %r14 + movq 16(%rsp),%r13 +.cfi_restore %r13 + movq 24(%rsp),%r12 +.cfi_restore %r12 + movq 32(%rsp),%rbp +.cfi_restore %rbp + movq 40(%rsp),%rbx +.cfi_restore %rbx + leaq 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 +.Lossl_rsaz_amm52x20_x2_ifma256_epilogue: + .byte 0xf3,0xc3 +.cfi_endproc +.size ossl_rsaz_amm52x20_x2_ifma256, .-ossl_rsaz_amm52x20_x2_ifma256 +.text + +.align 32 +.globl ossl_extract_multiplier_2x20_win5 +.type ossl_extract_multiplier_2x20_win5,@function +ossl_extract_multiplier_2x20_win5: +.cfi_startproc +.byte 243,15,30,250 + vmovdqa64 .Lones(%rip),%ymm24 + vpbroadcastq %rdx,%ymm22 + vpbroadcastq %rcx,%ymm23 + leaq 10240(%rsi),%rax + + + vpxor %xmm0,%xmm0,%xmm0 + vmovdqa64 %ymm0,%ymm21 + vmovdqa64 %ymm0,%ymm1 + vmovdqa64 %ymm0,%ymm2 + vmovdqa64 %ymm0,%ymm3 + vmovdqa64 %ymm0,%ymm4 + vmovdqa64 %ymm0,%ymm5 + vmovdqa64 %ymm0,%ymm16 + vmovdqa64 %ymm0,%ymm17 + vmovdqa64 %ymm0,%ymm18 + vmovdqa64 %ymm0,%ymm19 + +.align 32 +.Lloop: + vpcmpq $0,%ymm21,%ymm22,%k1 + vpcmpq $0,%ymm21,%ymm23,%k2 + vmovdqu64 0(%rsi),%ymm20 + vpblendmq %ymm20,%ymm0,%ymm0{%k1} + vmovdqu64 32(%rsi),%ymm20 + vpblendmq %ymm20,%ymm1,%ymm1{%k1} + vmovdqu64 64(%rsi),%ymm20 + vpblendmq %ymm20,%ymm2,%ymm2{%k1} + vmovdqu64 96(%rsi),%ymm20 + vpblendmq %ymm20,%ymm3,%ymm3{%k1} + vmovdqu64 128(%rsi),%ymm20 + vpblendmq %ymm20,%ymm4,%ymm4{%k1} + vmovdqu64 160(%rsi),%ymm20 + vpblendmq %ymm20,%ymm5,%ymm5{%k2} + vmovdqu64 192(%rsi),%ymm20 + vpblendmq %ymm20,%ymm16,%ymm16{%k2} + vmovdqu64 224(%rsi),%ymm20 + vpblendmq %ymm20,%ymm17,%ymm17{%k2} + vmovdqu64 256(%rsi),%ymm20 + vpblendmq %ymm20,%ymm18,%ymm18{%k2} + vmovdqu64 288(%rsi),%ymm20 + vpblendmq %ymm20,%ymm19,%ymm19{%k2} + vpaddq %ymm24,%ymm21,%ymm21 + addq $320,%rsi + cmpq %rsi,%rax + jne .Lloop + vmovdqu64 %ymm0,0(%rdi) + vmovdqu64 %ymm1,32(%rdi) + vmovdqu64 %ymm2,64(%rdi) + vmovdqu64 %ymm3,96(%rdi) + vmovdqu64 %ymm4,128(%rdi) + vmovdqu64 %ymm5,160(%rdi) + vmovdqu64 %ymm16,192(%rdi) + vmovdqu64 %ymm17,224(%rdi) + vmovdqu64 %ymm18,256(%rdi) + vmovdqu64 %ymm19,288(%rdi) + .byte 0xf3,0xc3 +.cfi_endproc +.size ossl_extract_multiplier_2x20_win5, .-ossl_extract_multiplier_2x20_win5 +.section .rodata +.align 32 +.Lones: +.quad 1,1,1,1 +.Lzeros: +.quad 0,0,0,0 + .section ".note.gnu.property", "a" + .p2align 3 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + # "GNU" encoded with .byte, since .asciz isn't supported + # on Solaris. 
+ .byte 0x47 + .byte 0x4e + .byte 0x55 + .byte 0 +1: + .p2align 3 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 3 +4: diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/bn/rsaz-3k-avx512.s b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/bn/rsaz-3k-avx512.s new file mode 100644 index 000000000000..408a6489292a --- /dev/null +++ b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/bn/rsaz-3k-avx512.s @@ -0,0 +1,1331 @@ +.text + +.globl ossl_rsaz_amm52x30_x1_ifma256 +.type ossl_rsaz_amm52x30_x1_ifma256,@function +.align 32 +ossl_rsaz_amm52x30_x1_ifma256: +.cfi_startproc +.byte 243,15,30,250 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + + vpxord %ymm0,%ymm0,%ymm0 + vmovdqa64 %ymm0,%ymm3 + vmovdqa64 %ymm0,%ymm4 + vmovdqa64 %ymm0,%ymm5 + vmovdqa64 %ymm0,%ymm6 + vmovdqa64 %ymm0,%ymm7 + vmovdqa64 %ymm0,%ymm8 + vmovdqa64 %ymm0,%ymm9 + vmovdqa64 %ymm0,%ymm10 + + xorl %r9d,%r9d + + movq %rdx,%r11 + movq $0xfffffffffffff,%rax + + + movl $7,%ebx + +.align 32 +.Lloop7: + movq 0(%r11),%r13 + + vpbroadcastq %r13,%ymm1 + movq 0(%rsi),%rdx + mulxq %r13,%r13,%r12 + addq %r13,%r9 + movq %r12,%r10 + adcq $0,%r10 + + movq %r8,%r13 + imulq %r9,%r13 + andq %rax,%r13 + + vpbroadcastq %r13,%ymm2 + movq 0(%rcx),%rdx + mulxq %r13,%r13,%r12 + addq %r13,%r9 + adcq %r12,%r10 + + shrq $52,%r9 + salq $12,%r10 + orq %r10,%r9 + + vpmadd52luq 0(%rsi),%ymm1,%ymm3 + vpmadd52luq 32(%rsi),%ymm1,%ymm4 + vpmadd52luq 64(%rsi),%ymm1,%ymm5 + vpmadd52luq 96(%rsi),%ymm1,%ymm6 + vpmadd52luq 128(%rsi),%ymm1,%ymm7 + vpmadd52luq 160(%rsi),%ymm1,%ymm8 + vpmadd52luq 192(%rsi),%ymm1,%ymm9 + vpmadd52luq 224(%rsi),%ymm1,%ymm10 + + vpmadd52luq 0(%rcx),%ymm2,%ymm3 + vpmadd52luq 32(%rcx),%ymm2,%ymm4 + vpmadd52luq 64(%rcx),%ymm2,%ymm5 + vpmadd52luq 96(%rcx),%ymm2,%ymm6 + vpmadd52luq 128(%rcx),%ymm2,%ymm7 + vpmadd52luq 160(%rcx),%ymm2,%ymm8 + vpmadd52luq 192(%rcx),%ymm2,%ymm9 + vpmadd52luq 224(%rcx),%ymm2,%ymm10 + + + valignq $1,%ymm3,%ymm4,%ymm3 + valignq $1,%ymm4,%ymm5,%ymm4 + valignq $1,%ymm5,%ymm6,%ymm5 + valignq $1,%ymm6,%ymm7,%ymm6 + valignq $1,%ymm7,%ymm8,%ymm7 + valignq $1,%ymm8,%ymm9,%ymm8 + valignq $1,%ymm9,%ymm10,%ymm9 + valignq $1,%ymm10,%ymm0,%ymm10 + + vmovq %xmm3,%r13 + addq %r13,%r9 + + vpmadd52huq 0(%rsi),%ymm1,%ymm3 + vpmadd52huq 32(%rsi),%ymm1,%ymm4 + vpmadd52huq 64(%rsi),%ymm1,%ymm5 + vpmadd52huq 96(%rsi),%ymm1,%ymm6 + vpmadd52huq 128(%rsi),%ymm1,%ymm7 + vpmadd52huq 160(%rsi),%ymm1,%ymm8 + vpmadd52huq 192(%rsi),%ymm1,%ymm9 + vpmadd52huq 224(%rsi),%ymm1,%ymm10 + + vpmadd52huq 0(%rcx),%ymm2,%ymm3 + vpmadd52huq 32(%rcx),%ymm2,%ymm4 + vpmadd52huq 64(%rcx),%ymm2,%ymm5 + vpmadd52huq 96(%rcx),%ymm2,%ymm6 + vpmadd52huq 128(%rcx),%ymm2,%ymm7 + vpmadd52huq 160(%rcx),%ymm2,%ymm8 + vpmadd52huq 192(%rcx),%ymm2,%ymm9 + vpmadd52huq 224(%rcx),%ymm2,%ymm10 + movq 8(%r11),%r13 + + vpbroadcastq %r13,%ymm1 + movq 0(%rsi),%rdx + mulxq %r13,%r13,%r12 + addq %r13,%r9 + movq %r12,%r10 + adcq $0,%r10 + + movq %r8,%r13 + imulq %r9,%r13 + andq %rax,%r13 + + vpbroadcastq %r13,%ymm2 + movq 0(%rcx),%rdx + mulxq %r13,%r13,%r12 + addq %r13,%r9 + adcq %r12,%r10 + + shrq $52,%r9 + salq $12,%r10 + orq %r10,%r9 + + vpmadd52luq 0(%rsi),%ymm1,%ymm3 + vpmadd52luq 32(%rsi),%ymm1,%ymm4 + vpmadd52luq 
64(%rsi),%ymm1,%ymm5 + vpmadd52luq 96(%rsi),%ymm1,%ymm6 + vpmadd52luq 128(%rsi),%ymm1,%ymm7 + vpmadd52luq 160(%rsi),%ymm1,%ymm8 + vpmadd52luq 192(%rsi),%ymm1,%ymm9 + vpmadd52luq 224(%rsi),%ymm1,%ymm10 + + vpmadd52luq 0(%rcx),%ymm2,%ymm3 + vpmadd52luq 32(%rcx),%ymm2,%ymm4 + vpmadd52luq 64(%rcx),%ymm2,%ymm5 + vpmadd52luq 96(%rcx),%ymm2,%ymm6 + vpmadd52luq 128(%rcx),%ymm2,%ymm7 + vpmadd52luq 160(%rcx),%ymm2,%ymm8 + vpmadd52luq 192(%rcx),%ymm2,%ymm9 + vpmadd52luq 224(%rcx),%ymm2,%ymm10 + + + valignq $1,%ymm3,%ymm4,%ymm3 + valignq $1,%ymm4,%ymm5,%ymm4 + valignq $1,%ymm5,%ymm6,%ymm5 + valignq $1,%ymm6,%ymm7,%ymm6 + valignq $1,%ymm7,%ymm8,%ymm7 + valignq $1,%ymm8,%ymm9,%ymm8 + valignq $1,%ymm9,%ymm10,%ymm9 + valignq $1,%ymm10,%ymm0,%ymm10 + + vmovq %xmm3,%r13 + addq %r13,%r9 + + vpmadd52huq 0(%rsi),%ymm1,%ymm3 + vpmadd52huq 32(%rsi),%ymm1,%ymm4 + vpmadd52huq 64(%rsi),%ymm1,%ymm5 + vpmadd52huq 96(%rsi),%ymm1,%ymm6 + vpmadd52huq 128(%rsi),%ymm1,%ymm7 + vpmadd52huq 160(%rsi),%ymm1,%ymm8 + vpmadd52huq 192(%rsi),%ymm1,%ymm9 + vpmadd52huq 224(%rsi),%ymm1,%ymm10 + + vpmadd52huq 0(%rcx),%ymm2,%ymm3 + vpmadd52huq 32(%rcx),%ymm2,%ymm4 + vpmadd52huq 64(%rcx),%ymm2,%ymm5 + vpmadd52huq 96(%rcx),%ymm2,%ymm6 + vpmadd52huq 128(%rcx),%ymm2,%ymm7 + vpmadd52huq 160(%rcx),%ymm2,%ymm8 + vpmadd52huq 192(%rcx),%ymm2,%ymm9 + vpmadd52huq 224(%rcx),%ymm2,%ymm10 + movq 16(%r11),%r13 + + vpbroadcastq %r13,%ymm1 + movq 0(%rsi),%rdx + mulxq %r13,%r13,%r12 + addq %r13,%r9 + movq %r12,%r10 + adcq $0,%r10 + + movq %r8,%r13 + imulq %r9,%r13 + andq %rax,%r13 + + vpbroadcastq %r13,%ymm2 + movq 0(%rcx),%rdx + mulxq %r13,%r13,%r12 + addq %r13,%r9 + adcq %r12,%r10 + + shrq $52,%r9 + salq $12,%r10 + orq %r10,%r9 + + vpmadd52luq 0(%rsi),%ymm1,%ymm3 + vpmadd52luq 32(%rsi),%ymm1,%ymm4 + vpmadd52luq 64(%rsi),%ymm1,%ymm5 + vpmadd52luq 96(%rsi),%ymm1,%ymm6 + vpmadd52luq 128(%rsi),%ymm1,%ymm7 + vpmadd52luq 160(%rsi),%ymm1,%ymm8 + vpmadd52luq 192(%rsi),%ymm1,%ymm9 + vpmadd52luq 224(%rsi),%ymm1,%ymm10 + + vpmadd52luq 0(%rcx),%ymm2,%ymm3 + vpmadd52luq 32(%rcx),%ymm2,%ymm4 + vpmadd52luq 64(%rcx),%ymm2,%ymm5 + vpmadd52luq 96(%rcx),%ymm2,%ymm6 + vpmadd52luq 128(%rcx),%ymm2,%ymm7 + vpmadd52luq 160(%rcx),%ymm2,%ymm8 + vpmadd52luq 192(%rcx),%ymm2,%ymm9 + vpmadd52luq 224(%rcx),%ymm2,%ymm10 + + + valignq $1,%ymm3,%ymm4,%ymm3 + valignq $1,%ymm4,%ymm5,%ymm4 + valignq $1,%ymm5,%ymm6,%ymm5 + valignq $1,%ymm6,%ymm7,%ymm6 + valignq $1,%ymm7,%ymm8,%ymm7 + valignq $1,%ymm8,%ymm9,%ymm8 + valignq $1,%ymm9,%ymm10,%ymm9 + valignq $1,%ymm10,%ymm0,%ymm10 + + vmovq %xmm3,%r13 + addq %r13,%r9 + + vpmadd52huq 0(%rsi),%ymm1,%ymm3 + vpmadd52huq 32(%rsi),%ymm1,%ymm4 + vpmadd52huq 64(%rsi),%ymm1,%ymm5 + vpmadd52huq 96(%rsi),%ymm1,%ymm6 + vpmadd52huq 128(%rsi),%ymm1,%ymm7 + vpmadd52huq 160(%rsi),%ymm1,%ymm8 + vpmadd52huq 192(%rsi),%ymm1,%ymm9 + vpmadd52huq 224(%rsi),%ymm1,%ymm10 + + vpmadd52huq 0(%rcx),%ymm2,%ymm3 + vpmadd52huq 32(%rcx),%ymm2,%ymm4 + vpmadd52huq 64(%rcx),%ymm2,%ymm5 + vpmadd52huq 96(%rcx),%ymm2,%ymm6 + vpmadd52huq 128(%rcx),%ymm2,%ymm7 + vpmadd52huq 160(%rcx),%ymm2,%ymm8 + vpmadd52huq 192(%rcx),%ymm2,%ymm9 + vpmadd52huq 224(%rcx),%ymm2,%ymm10 + movq 24(%r11),%r13 + + vpbroadcastq %r13,%ymm1 + movq 0(%rsi),%rdx + mulxq %r13,%r13,%r12 + addq %r13,%r9 + movq %r12,%r10 + adcq $0,%r10 + + movq %r8,%r13 + imulq %r9,%r13 + andq %rax,%r13 + + vpbroadcastq %r13,%ymm2 + movq 0(%rcx),%rdx + mulxq %r13,%r13,%r12 + addq %r13,%r9 + adcq %r12,%r10 + + shrq $52,%r9 + salq $12,%r10 + orq %r10,%r9 + + vpmadd52luq 0(%rsi),%ymm1,%ymm3 + vpmadd52luq 32(%rsi),%ymm1,%ymm4 + 
vpmadd52luq 64(%rsi),%ymm1,%ymm5 + vpmadd52luq 96(%rsi),%ymm1,%ymm6 + vpmadd52luq 128(%rsi),%ymm1,%ymm7 + vpmadd52luq 160(%rsi),%ymm1,%ymm8 + vpmadd52luq 192(%rsi),%ymm1,%ymm9 + vpmadd52luq 224(%rsi),%ymm1,%ymm10 + + vpmadd52luq 0(%rcx),%ymm2,%ymm3 + vpmadd52luq 32(%rcx),%ymm2,%ymm4 + vpmadd52luq 64(%rcx),%ymm2,%ymm5 + vpmadd52luq 96(%rcx),%ymm2,%ymm6 + vpmadd52luq 128(%rcx),%ymm2,%ymm7 + vpmadd52luq 160(%rcx),%ymm2,%ymm8 + vpmadd52luq 192(%rcx),%ymm2,%ymm9 + vpmadd52luq 224(%rcx),%ymm2,%ymm10 + + + valignq $1,%ymm3,%ymm4,%ymm3 + valignq $1,%ymm4,%ymm5,%ymm4 + valignq $1,%ymm5,%ymm6,%ymm5 + valignq $1,%ymm6,%ymm7,%ymm6 + valignq $1,%ymm7,%ymm8,%ymm7 + valignq $1,%ymm8,%ymm9,%ymm8 + valignq $1,%ymm9,%ymm10,%ymm9 + valignq $1,%ymm10,%ymm0,%ymm10 + + vmovq %xmm3,%r13 + addq %r13,%r9 + + vpmadd52huq 0(%rsi),%ymm1,%ymm3 + vpmadd52huq 32(%rsi),%ymm1,%ymm4 + vpmadd52huq 64(%rsi),%ymm1,%ymm5 + vpmadd52huq 96(%rsi),%ymm1,%ymm6 + vpmadd52huq 128(%rsi),%ymm1,%ymm7 + vpmadd52huq 160(%rsi),%ymm1,%ymm8 + vpmadd52huq 192(%rsi),%ymm1,%ymm9 + vpmadd52huq 224(%rsi),%ymm1,%ymm10 + + vpmadd52huq 0(%rcx),%ymm2,%ymm3 + vpmadd52huq 32(%rcx),%ymm2,%ymm4 + vpmadd52huq 64(%rcx),%ymm2,%ymm5 + vpmadd52huq 96(%rcx),%ymm2,%ymm6 + vpmadd52huq 128(%rcx),%ymm2,%ymm7 + vpmadd52huq 160(%rcx),%ymm2,%ymm8 + vpmadd52huq 192(%rcx),%ymm2,%ymm9 + vpmadd52huq 224(%rcx),%ymm2,%ymm10 + leaq 32(%r11),%r11 + decl %ebx + jne .Lloop7 + movq 0(%r11),%r13 + + vpbroadcastq %r13,%ymm1 + movq 0(%rsi),%rdx + mulxq %r13,%r13,%r12 + addq %r13,%r9 + movq %r12,%r10 + adcq $0,%r10 + + movq %r8,%r13 + imulq %r9,%r13 + andq %rax,%r13 + + vpbroadcastq %r13,%ymm2 + movq 0(%rcx),%rdx + mulxq %r13,%r13,%r12 + addq %r13,%r9 + adcq %r12,%r10 + + shrq $52,%r9 + salq $12,%r10 + orq %r10,%r9 + + vpmadd52luq 0(%rsi),%ymm1,%ymm3 + vpmadd52luq 32(%rsi),%ymm1,%ymm4 + vpmadd52luq 64(%rsi),%ymm1,%ymm5 + vpmadd52luq 96(%rsi),%ymm1,%ymm6 + vpmadd52luq 128(%rsi),%ymm1,%ymm7 + vpmadd52luq 160(%rsi),%ymm1,%ymm8 + vpmadd52luq 192(%rsi),%ymm1,%ymm9 + vpmadd52luq 224(%rsi),%ymm1,%ymm10 + + vpmadd52luq 0(%rcx),%ymm2,%ymm3 + vpmadd52luq 32(%rcx),%ymm2,%ymm4 + vpmadd52luq 64(%rcx),%ymm2,%ymm5 + vpmadd52luq 96(%rcx),%ymm2,%ymm6 + vpmadd52luq 128(%rcx),%ymm2,%ymm7 + vpmadd52luq 160(%rcx),%ymm2,%ymm8 + vpmadd52luq 192(%rcx),%ymm2,%ymm9 + vpmadd52luq 224(%rcx),%ymm2,%ymm10 + + + valignq $1,%ymm3,%ymm4,%ymm3 + valignq $1,%ymm4,%ymm5,%ymm4 + valignq $1,%ymm5,%ymm6,%ymm5 + valignq $1,%ymm6,%ymm7,%ymm6 + valignq $1,%ymm7,%ymm8,%ymm7 + valignq $1,%ymm8,%ymm9,%ymm8 + valignq $1,%ymm9,%ymm10,%ymm9 + valignq $1,%ymm10,%ymm0,%ymm10 + + vmovq %xmm3,%r13 + addq %r13,%r9 + + vpmadd52huq 0(%rsi),%ymm1,%ymm3 + vpmadd52huq 32(%rsi),%ymm1,%ymm4 + vpmadd52huq 64(%rsi),%ymm1,%ymm5 + vpmadd52huq 96(%rsi),%ymm1,%ymm6 + vpmadd52huq 128(%rsi),%ymm1,%ymm7 + vpmadd52huq 160(%rsi),%ymm1,%ymm8 + vpmadd52huq 192(%rsi),%ymm1,%ymm9 + vpmadd52huq 224(%rsi),%ymm1,%ymm10 + + vpmadd52huq 0(%rcx),%ymm2,%ymm3 + vpmadd52huq 32(%rcx),%ymm2,%ymm4 + vpmadd52huq 64(%rcx),%ymm2,%ymm5 + vpmadd52huq 96(%rcx),%ymm2,%ymm6 + vpmadd52huq 128(%rcx),%ymm2,%ymm7 + vpmadd52huq 160(%rcx),%ymm2,%ymm8 + vpmadd52huq 192(%rcx),%ymm2,%ymm9 + vpmadd52huq 224(%rcx),%ymm2,%ymm10 + movq 8(%r11),%r13 + + vpbroadcastq %r13,%ymm1 + movq 0(%rsi),%rdx + mulxq %r13,%r13,%r12 + addq %r13,%r9 + movq %r12,%r10 + adcq $0,%r10 + + movq %r8,%r13 + imulq %r9,%r13 + andq %rax,%r13 + + vpbroadcastq %r13,%ymm2 + movq 0(%rcx),%rdx + mulxq %r13,%r13,%r12 + addq %r13,%r9 + adcq %r12,%r10 + + shrq $52,%r9 + salq $12,%r10 + orq %r10,%r9 + + vpmadd52luq 
0(%rsi),%ymm1,%ymm3 + vpmadd52luq 32(%rsi),%ymm1,%ymm4 + vpmadd52luq 64(%rsi),%ymm1,%ymm5 + vpmadd52luq 96(%rsi),%ymm1,%ymm6 + vpmadd52luq 128(%rsi),%ymm1,%ymm7 + vpmadd52luq 160(%rsi),%ymm1,%ymm8 + vpmadd52luq 192(%rsi),%ymm1,%ymm9 + vpmadd52luq 224(%rsi),%ymm1,%ymm10 + + vpmadd52luq 0(%rcx),%ymm2,%ymm3 + vpmadd52luq 32(%rcx),%ymm2,%ymm4 + vpmadd52luq 64(%rcx),%ymm2,%ymm5 + vpmadd52luq 96(%rcx),%ymm2,%ymm6 + vpmadd52luq 128(%rcx),%ymm2,%ymm7 + vpmadd52luq 160(%rcx),%ymm2,%ymm8 + vpmadd52luq 192(%rcx),%ymm2,%ymm9 + vpmadd52luq 224(%rcx),%ymm2,%ymm10 + + + valignq $1,%ymm3,%ymm4,%ymm3 + valignq $1,%ymm4,%ymm5,%ymm4 + valignq $1,%ymm5,%ymm6,%ymm5 + valignq $1,%ymm6,%ymm7,%ymm6 + valignq $1,%ymm7,%ymm8,%ymm7 + valignq $1,%ymm8,%ymm9,%ymm8 + valignq $1,%ymm9,%ymm10,%ymm9 + valignq $1,%ymm10,%ymm0,%ymm10 + + vmovq %xmm3,%r13 + addq %r13,%r9 + + vpmadd52huq 0(%rsi),%ymm1,%ymm3 + vpmadd52huq 32(%rsi),%ymm1,%ymm4 + vpmadd52huq 64(%rsi),%ymm1,%ymm5 + vpmadd52huq 96(%rsi),%ymm1,%ymm6 + vpmadd52huq 128(%rsi),%ymm1,%ymm7 + vpmadd52huq 160(%rsi),%ymm1,%ymm8 + vpmadd52huq 192(%rsi),%ymm1,%ymm9 + vpmadd52huq 224(%rsi),%ymm1,%ymm10 + + vpmadd52huq 0(%rcx),%ymm2,%ymm3 + vpmadd52huq 32(%rcx),%ymm2,%ymm4 + vpmadd52huq 64(%rcx),%ymm2,%ymm5 + vpmadd52huq 96(%rcx),%ymm2,%ymm6 + vpmadd52huq 128(%rcx),%ymm2,%ymm7 + vpmadd52huq 160(%rcx),%ymm2,%ymm8 + vpmadd52huq 192(%rcx),%ymm2,%ymm9 + vpmadd52huq 224(%rcx),%ymm2,%ymm10 + + vpbroadcastq %r9,%ymm0 + vpblendd $3,%ymm0,%ymm3,%ymm3 + + + + vpsrlq $52,%ymm3,%ymm0 + vpsrlq $52,%ymm4,%ymm1 + vpsrlq $52,%ymm5,%ymm2 + vpsrlq $52,%ymm6,%ymm19 + vpsrlq $52,%ymm7,%ymm20 + vpsrlq $52,%ymm8,%ymm21 + vpsrlq $52,%ymm9,%ymm22 + vpsrlq $52,%ymm10,%ymm23 + + + valignq $3,%ymm22,%ymm23,%ymm23 + valignq $3,%ymm21,%ymm22,%ymm22 + valignq $3,%ymm20,%ymm21,%ymm21 + valignq $3,%ymm19,%ymm20,%ymm20 + valignq $3,%ymm2,%ymm19,%ymm19 + valignq $3,%ymm1,%ymm2,%ymm2 + valignq $3,%ymm0,%ymm1,%ymm1 + valignq $3,.Lzeros(%rip),%ymm0,%ymm0 + + + vpandq .Lmask52x4(%rip),%ymm3,%ymm3 + vpandq .Lmask52x4(%rip),%ymm4,%ymm4 + vpandq .Lmask52x4(%rip),%ymm5,%ymm5 + vpandq .Lmask52x4(%rip),%ymm6,%ymm6 + vpandq .Lmask52x4(%rip),%ymm7,%ymm7 + vpandq .Lmask52x4(%rip),%ymm8,%ymm8 + vpandq .Lmask52x4(%rip),%ymm9,%ymm9 + vpandq .Lmask52x4(%rip),%ymm10,%ymm10 + + + vpaddq %ymm0,%ymm3,%ymm3 + vpaddq %ymm1,%ymm4,%ymm4 + vpaddq %ymm2,%ymm5,%ymm5 + vpaddq %ymm19,%ymm6,%ymm6 + vpaddq %ymm20,%ymm7,%ymm7 + vpaddq %ymm21,%ymm8,%ymm8 + vpaddq %ymm22,%ymm9,%ymm9 + vpaddq %ymm23,%ymm10,%ymm10 + + + + vpcmpuq $6,.Lmask52x4(%rip),%ymm3,%k1 + vpcmpuq $6,.Lmask52x4(%rip),%ymm4,%k2 + kmovb %k1,%r14d + kmovb %k2,%r13d + shlb $4,%r13b + orb %r13b,%r14b + + vpcmpuq $6,.Lmask52x4(%rip),%ymm5,%k1 + vpcmpuq $6,.Lmask52x4(%rip),%ymm6,%k2 + kmovb %k1,%r13d + kmovb %k2,%r12d + shlb $4,%r12b + orb %r12b,%r13b + + vpcmpuq $6,.Lmask52x4(%rip),%ymm7,%k1 + vpcmpuq $6,.Lmask52x4(%rip),%ymm8,%k2 + kmovb %k1,%r12d + kmovb %k2,%r11d + shlb $4,%r11b + orb %r11b,%r12b + + vpcmpuq $6,.Lmask52x4(%rip),%ymm9,%k1 + vpcmpuq $6,.Lmask52x4(%rip),%ymm10,%k2 + kmovb %k1,%r11d + kmovb %k2,%r10d + shlb $4,%r10b + orb %r10b,%r11b + + addb %r14b,%r14b + adcb %r13b,%r13b + adcb %r12b,%r12b + adcb %r11b,%r11b + + + vpcmpuq $0,.Lmask52x4(%rip),%ymm3,%k1 + vpcmpuq $0,.Lmask52x4(%rip),%ymm4,%k2 + kmovb %k1,%r9d + kmovb %k2,%r8d + shlb $4,%r8b + orb %r8b,%r9b + + vpcmpuq $0,.Lmask52x4(%rip),%ymm5,%k1 + vpcmpuq $0,.Lmask52x4(%rip),%ymm6,%k2 + kmovb %k1,%r8d + kmovb %k2,%edx + shlb $4,%dl + orb %dl,%r8b + + vpcmpuq $0,.Lmask52x4(%rip),%ymm7,%k1 + vpcmpuq 
$0,.Lmask52x4(%rip),%ymm8,%k2 + kmovb %k1,%edx + kmovb %k2,%ecx + shlb $4,%cl + orb %cl,%dl + + vpcmpuq $0,.Lmask52x4(%rip),%ymm9,%k1 + vpcmpuq $0,.Lmask52x4(%rip),%ymm10,%k2 + kmovb %k1,%ecx + kmovb %k2,%ebx + shlb $4,%bl + orb %bl,%cl + + addb %r9b,%r14b + adcb %r8b,%r13b + adcb %dl,%r12b + adcb %cl,%r11b + + xorb %r9b,%r14b + xorb %r8b,%r13b + xorb %dl,%r12b + xorb %cl,%r11b + + kmovb %r14d,%k1 + shrb $4,%r14b + kmovb %r14d,%k2 + kmovb %r13d,%k3 + shrb $4,%r13b + kmovb %r13d,%k4 + kmovb %r12d,%k5 + shrb $4,%r12b + kmovb %r12d,%k6 + kmovb %r11d,%k7 + + vpsubq .Lmask52x4(%rip),%ymm3,%ymm3{%k1} + vpsubq .Lmask52x4(%rip),%ymm4,%ymm4{%k2} + vpsubq .Lmask52x4(%rip),%ymm5,%ymm5{%k3} + vpsubq .Lmask52x4(%rip),%ymm6,%ymm6{%k4} + vpsubq .Lmask52x4(%rip),%ymm7,%ymm7{%k5} + vpsubq .Lmask52x4(%rip),%ymm8,%ymm8{%k6} + vpsubq .Lmask52x4(%rip),%ymm9,%ymm9{%k7} + + vpandq .Lmask52x4(%rip),%ymm3,%ymm3 + vpandq .Lmask52x4(%rip),%ymm4,%ymm4 + vpandq .Lmask52x4(%rip),%ymm5,%ymm5 + vpandq .Lmask52x4(%rip),%ymm6,%ymm6 + vpandq .Lmask52x4(%rip),%ymm7,%ymm7 + vpandq .Lmask52x4(%rip),%ymm8,%ymm8 + vpandq .Lmask52x4(%rip),%ymm9,%ymm9 + + shrb $4,%r11b + kmovb %r11d,%k1 + + vpsubq .Lmask52x4(%rip),%ymm10,%ymm10{%k1} + + vpandq .Lmask52x4(%rip),%ymm10,%ymm10 + + vmovdqu64 %ymm3,0(%rdi) + vmovdqu64 %ymm4,32(%rdi) + vmovdqu64 %ymm5,64(%rdi) + vmovdqu64 %ymm6,96(%rdi) + vmovdqu64 %ymm7,128(%rdi) + vmovdqu64 %ymm8,160(%rdi) + vmovdqu64 %ymm9,192(%rdi) + vmovdqu64 %ymm10,224(%rdi) + + vzeroupper + leaq (%rsp),%rax +.cfi_def_cfa_register %rax + movq 0(%rax),%r15 +.cfi_restore %r15 + movq 8(%rax),%r14 +.cfi_restore %r14 + movq 16(%rax),%r13 +.cfi_restore %r13 + movq 24(%rax),%r12 +.cfi_restore %r12 + movq 32(%rax),%rbp +.cfi_restore %rbp + movq 40(%rax),%rbx +.cfi_restore %rbx + leaq 48(%rax),%rsp +.cfi_def_cfa %rsp,8 +.Lossl_rsaz_amm52x30_x1_ifma256_epilogue: + .byte 0xf3,0xc3 +.cfi_endproc +.size ossl_rsaz_amm52x30_x1_ifma256, .-ossl_rsaz_amm52x30_x1_ifma256 +.section .rodata +.align 32 +.Lmask52x4: +.quad 0xfffffffffffff +.quad 0xfffffffffffff +.quad 0xfffffffffffff +.quad 0xfffffffffffff +.text + +.globl ossl_rsaz_amm52x30_x2_ifma256 +.type ossl_rsaz_amm52x30_x2_ifma256,@function +.align 32 +ossl_rsaz_amm52x30_x2_ifma256: +.cfi_startproc +.byte 243,15,30,250 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + + vpxord %ymm0,%ymm0,%ymm0 + vmovdqa64 %ymm0,%ymm3 + vmovdqa64 %ymm0,%ymm4 + vmovdqa64 %ymm0,%ymm5 + vmovdqa64 %ymm0,%ymm6 + vmovdqa64 %ymm0,%ymm7 + vmovdqa64 %ymm0,%ymm8 + vmovdqa64 %ymm0,%ymm9 + vmovdqa64 %ymm0,%ymm10 + + vmovdqa64 %ymm0,%ymm11 + vmovdqa64 %ymm0,%ymm12 + vmovdqa64 %ymm0,%ymm13 + vmovdqa64 %ymm0,%ymm14 + vmovdqa64 %ymm0,%ymm15 + vmovdqa64 %ymm0,%ymm16 + vmovdqa64 %ymm0,%ymm17 + vmovdqa64 %ymm0,%ymm18 + + + xorl %r9d,%r9d + xorl %r15d,%r15d + + movq %rdx,%r11 + movq $0xfffffffffffff,%rax + + movl $30,%ebx + +.align 32 +.Lloop30: + movq 0(%r11),%r13 + + vpbroadcastq %r13,%ymm1 + movq 0(%rsi),%rdx + mulxq %r13,%r13,%r12 + addq %r13,%r9 + movq %r12,%r10 + adcq $0,%r10 + + movq (%r8),%r13 + imulq %r9,%r13 + andq %rax,%r13 + + vpbroadcastq %r13,%ymm2 + movq 0(%rcx),%rdx + mulxq %r13,%r13,%r12 + addq %r13,%r9 + adcq %r12,%r10 + + shrq $52,%r9 + salq $12,%r10 + orq %r10,%r9 + + vpmadd52luq 
0(%rsi),%ymm1,%ymm3 + vpmadd52luq 32(%rsi),%ymm1,%ymm4 + vpmadd52luq 64(%rsi),%ymm1,%ymm5 + vpmadd52luq 96(%rsi),%ymm1,%ymm6 + vpmadd52luq 128(%rsi),%ymm1,%ymm7 + vpmadd52luq 160(%rsi),%ymm1,%ymm8 + vpmadd52luq 192(%rsi),%ymm1,%ymm9 + vpmadd52luq 224(%rsi),%ymm1,%ymm10 + + vpmadd52luq 0(%rcx),%ymm2,%ymm3 + vpmadd52luq 32(%rcx),%ymm2,%ymm4 + vpmadd52luq 64(%rcx),%ymm2,%ymm5 + vpmadd52luq 96(%rcx),%ymm2,%ymm6 + vpmadd52luq 128(%rcx),%ymm2,%ymm7 + vpmadd52luq 160(%rcx),%ymm2,%ymm8 + vpmadd52luq 192(%rcx),%ymm2,%ymm9 + vpmadd52luq 224(%rcx),%ymm2,%ymm10 + + + valignq $1,%ymm3,%ymm4,%ymm3 + valignq $1,%ymm4,%ymm5,%ymm4 + valignq $1,%ymm5,%ymm6,%ymm5 + valignq $1,%ymm6,%ymm7,%ymm6 + valignq $1,%ymm7,%ymm8,%ymm7 + valignq $1,%ymm8,%ymm9,%ymm8 + valignq $1,%ymm9,%ymm10,%ymm9 + valignq $1,%ymm10,%ymm0,%ymm10 + + vmovq %xmm3,%r13 + addq %r13,%r9 + + vpmadd52huq 0(%rsi),%ymm1,%ymm3 + vpmadd52huq 32(%rsi),%ymm1,%ymm4 + vpmadd52huq 64(%rsi),%ymm1,%ymm5 + vpmadd52huq 96(%rsi),%ymm1,%ymm6 + vpmadd52huq 128(%rsi),%ymm1,%ymm7 + vpmadd52huq 160(%rsi),%ymm1,%ymm8 + vpmadd52huq 192(%rsi),%ymm1,%ymm9 + vpmadd52huq 224(%rsi),%ymm1,%ymm10 + + vpmadd52huq 0(%rcx),%ymm2,%ymm3 + vpmadd52huq 32(%rcx),%ymm2,%ymm4 + vpmadd52huq 64(%rcx),%ymm2,%ymm5 + vpmadd52huq 96(%rcx),%ymm2,%ymm6 + vpmadd52huq 128(%rcx),%ymm2,%ymm7 + vpmadd52huq 160(%rcx),%ymm2,%ymm8 + vpmadd52huq 192(%rcx),%ymm2,%ymm9 + vpmadd52huq 224(%rcx),%ymm2,%ymm10 + movq 256(%r11),%r13 + + vpbroadcastq %r13,%ymm1 + movq 256(%rsi),%rdx + mulxq %r13,%r13,%r12 + addq %r13,%r15 + movq %r12,%r10 + adcq $0,%r10 + + movq 8(%r8),%r13 + imulq %r15,%r13 + andq %rax,%r13 + + vpbroadcastq %r13,%ymm2 + movq 256(%rcx),%rdx + mulxq %r13,%r13,%r12 + addq %r13,%r15 + adcq %r12,%r10 + + shrq $52,%r15 + salq $12,%r10 + orq %r10,%r15 + + vpmadd52luq 256(%rsi),%ymm1,%ymm11 + vpmadd52luq 288(%rsi),%ymm1,%ymm12 + vpmadd52luq 320(%rsi),%ymm1,%ymm13 + vpmadd52luq 352(%rsi),%ymm1,%ymm14 + vpmadd52luq 384(%rsi),%ymm1,%ymm15 + vpmadd52luq 416(%rsi),%ymm1,%ymm16 + vpmadd52luq 448(%rsi),%ymm1,%ymm17 + vpmadd52luq 480(%rsi),%ymm1,%ymm18 + + vpmadd52luq 256(%rcx),%ymm2,%ymm11 + vpmadd52luq 288(%rcx),%ymm2,%ymm12 + vpmadd52luq 320(%rcx),%ymm2,%ymm13 + vpmadd52luq 352(%rcx),%ymm2,%ymm14 + vpmadd52luq 384(%rcx),%ymm2,%ymm15 + vpmadd52luq 416(%rcx),%ymm2,%ymm16 + vpmadd52luq 448(%rcx),%ymm2,%ymm17 + vpmadd52luq 480(%rcx),%ymm2,%ymm18 + + + valignq $1,%ymm11,%ymm12,%ymm11 + valignq $1,%ymm12,%ymm13,%ymm12 + valignq $1,%ymm13,%ymm14,%ymm13 + valignq $1,%ymm14,%ymm15,%ymm14 + valignq $1,%ymm15,%ymm16,%ymm15 + valignq $1,%ymm16,%ymm17,%ymm16 + valignq $1,%ymm17,%ymm18,%ymm17 + valignq $1,%ymm18,%ymm0,%ymm18 + + vmovq %xmm11,%r13 + addq %r13,%r15 + + vpmadd52huq 256(%rsi),%ymm1,%ymm11 + vpmadd52huq 288(%rsi),%ymm1,%ymm12 + vpmadd52huq 320(%rsi),%ymm1,%ymm13 + vpmadd52huq 352(%rsi),%ymm1,%ymm14 + vpmadd52huq 384(%rsi),%ymm1,%ymm15 + vpmadd52huq 416(%rsi),%ymm1,%ymm16 + vpmadd52huq 448(%rsi),%ymm1,%ymm17 + vpmadd52huq 480(%rsi),%ymm1,%ymm18 + + vpmadd52huq 256(%rcx),%ymm2,%ymm11 + vpmadd52huq 288(%rcx),%ymm2,%ymm12 + vpmadd52huq 320(%rcx),%ymm2,%ymm13 + vpmadd52huq 352(%rcx),%ymm2,%ymm14 + vpmadd52huq 384(%rcx),%ymm2,%ymm15 + vpmadd52huq 416(%rcx),%ymm2,%ymm16 + vpmadd52huq 448(%rcx),%ymm2,%ymm17 + vpmadd52huq 480(%rcx),%ymm2,%ymm18 + leaq 8(%r11),%r11 + decl %ebx + jne .Lloop30 + + vpbroadcastq %r9,%ymm0 + vpblendd $3,%ymm0,%ymm3,%ymm3 + + + + vpsrlq $52,%ymm3,%ymm0 + vpsrlq $52,%ymm4,%ymm1 + vpsrlq $52,%ymm5,%ymm2 + vpsrlq $52,%ymm6,%ymm19 + vpsrlq $52,%ymm7,%ymm20 + vpsrlq $52,%ymm8,%ymm21 + 
vpsrlq $52,%ymm9,%ymm22 + vpsrlq $52,%ymm10,%ymm23 + + + valignq $3,%ymm22,%ymm23,%ymm23 + valignq $3,%ymm21,%ymm22,%ymm22 + valignq $3,%ymm20,%ymm21,%ymm21 + valignq $3,%ymm19,%ymm20,%ymm20 + valignq $3,%ymm2,%ymm19,%ymm19 + valignq $3,%ymm1,%ymm2,%ymm2 + valignq $3,%ymm0,%ymm1,%ymm1 + valignq $3,.Lzeros(%rip),%ymm0,%ymm0 + + + vpandq .Lmask52x4(%rip),%ymm3,%ymm3 + vpandq .Lmask52x4(%rip),%ymm4,%ymm4 + vpandq .Lmask52x4(%rip),%ymm5,%ymm5 + vpandq .Lmask52x4(%rip),%ymm6,%ymm6 + vpandq .Lmask52x4(%rip),%ymm7,%ymm7 + vpandq .Lmask52x4(%rip),%ymm8,%ymm8 + vpandq .Lmask52x4(%rip),%ymm9,%ymm9 + vpandq .Lmask52x4(%rip),%ymm10,%ymm10 + + + vpaddq %ymm0,%ymm3,%ymm3 + vpaddq %ymm1,%ymm4,%ymm4 + vpaddq %ymm2,%ymm5,%ymm5 + vpaddq %ymm19,%ymm6,%ymm6 + vpaddq %ymm20,%ymm7,%ymm7 + vpaddq %ymm21,%ymm8,%ymm8 + vpaddq %ymm22,%ymm9,%ymm9 + vpaddq %ymm23,%ymm10,%ymm10 + + + + vpcmpuq $6,.Lmask52x4(%rip),%ymm3,%k1 + vpcmpuq $6,.Lmask52x4(%rip),%ymm4,%k2 + kmovb %k1,%r14d + kmovb %k2,%r13d + shlb $4,%r13b + orb %r13b,%r14b + + vpcmpuq $6,.Lmask52x4(%rip),%ymm5,%k1 + vpcmpuq $6,.Lmask52x4(%rip),%ymm6,%k2 + kmovb %k1,%r13d + kmovb %k2,%r12d + shlb $4,%r12b + orb %r12b,%r13b + + vpcmpuq $6,.Lmask52x4(%rip),%ymm7,%k1 + vpcmpuq $6,.Lmask52x4(%rip),%ymm8,%k2 + kmovb %k1,%r12d + kmovb %k2,%r11d + shlb $4,%r11b + orb %r11b,%r12b + + vpcmpuq $6,.Lmask52x4(%rip),%ymm9,%k1 + vpcmpuq $6,.Lmask52x4(%rip),%ymm10,%k2 + kmovb %k1,%r11d + kmovb %k2,%r10d + shlb $4,%r10b + orb %r10b,%r11b + + addb %r14b,%r14b + adcb %r13b,%r13b + adcb %r12b,%r12b + adcb %r11b,%r11b + + + vpcmpuq $0,.Lmask52x4(%rip),%ymm3,%k1 + vpcmpuq $0,.Lmask52x4(%rip),%ymm4,%k2 + kmovb %k1,%r9d + kmovb %k2,%r8d + shlb $4,%r8b + orb %r8b,%r9b + + vpcmpuq $0,.Lmask52x4(%rip),%ymm5,%k1 + vpcmpuq $0,.Lmask52x4(%rip),%ymm6,%k2 + kmovb %k1,%r8d + kmovb %k2,%edx + shlb $4,%dl + orb %dl,%r8b + + vpcmpuq $0,.Lmask52x4(%rip),%ymm7,%k1 + vpcmpuq $0,.Lmask52x4(%rip),%ymm8,%k2 + kmovb %k1,%edx + kmovb %k2,%ecx + shlb $4,%cl + orb %cl,%dl + + vpcmpuq $0,.Lmask52x4(%rip),%ymm9,%k1 + vpcmpuq $0,.Lmask52x4(%rip),%ymm10,%k2 + kmovb %k1,%ecx + kmovb %k2,%ebx + shlb $4,%bl + orb %bl,%cl + + addb %r9b,%r14b + adcb %r8b,%r13b + adcb %dl,%r12b + adcb %cl,%r11b + + xorb %r9b,%r14b + xorb %r8b,%r13b + xorb %dl,%r12b + xorb %cl,%r11b + + kmovb %r14d,%k1 + shrb $4,%r14b + kmovb %r14d,%k2 + kmovb %r13d,%k3 + shrb $4,%r13b + kmovb %r13d,%k4 + kmovb %r12d,%k5 + shrb $4,%r12b + kmovb %r12d,%k6 + kmovb %r11d,%k7 + + vpsubq .Lmask52x4(%rip),%ymm3,%ymm3{%k1} + vpsubq .Lmask52x4(%rip),%ymm4,%ymm4{%k2} + vpsubq .Lmask52x4(%rip),%ymm5,%ymm5{%k3} + vpsubq .Lmask52x4(%rip),%ymm6,%ymm6{%k4} + vpsubq .Lmask52x4(%rip),%ymm7,%ymm7{%k5} + vpsubq .Lmask52x4(%rip),%ymm8,%ymm8{%k6} + vpsubq .Lmask52x4(%rip),%ymm9,%ymm9{%k7} + + vpandq .Lmask52x4(%rip),%ymm3,%ymm3 + vpandq .Lmask52x4(%rip),%ymm4,%ymm4 + vpandq .Lmask52x4(%rip),%ymm5,%ymm5 + vpandq .Lmask52x4(%rip),%ymm6,%ymm6 + vpandq .Lmask52x4(%rip),%ymm7,%ymm7 + vpandq .Lmask52x4(%rip),%ymm8,%ymm8 + vpandq .Lmask52x4(%rip),%ymm9,%ymm9 + + shrb $4,%r11b + kmovb %r11d,%k1 + + vpsubq .Lmask52x4(%rip),%ymm10,%ymm10{%k1} + + vpandq .Lmask52x4(%rip),%ymm10,%ymm10 + + vpbroadcastq %r15,%ymm0 + vpblendd $3,%ymm0,%ymm11,%ymm11 + + + + vpsrlq $52,%ymm11,%ymm0 + vpsrlq $52,%ymm12,%ymm1 + vpsrlq $52,%ymm13,%ymm2 + vpsrlq $52,%ymm14,%ymm19 + vpsrlq $52,%ymm15,%ymm20 + vpsrlq $52,%ymm16,%ymm21 + vpsrlq $52,%ymm17,%ymm22 + vpsrlq $52,%ymm18,%ymm23 + + + valignq $3,%ymm22,%ymm23,%ymm23 + valignq $3,%ymm21,%ymm22,%ymm22 + valignq $3,%ymm20,%ymm21,%ymm21 + valignq 
$3,%ymm19,%ymm20,%ymm20 + valignq $3,%ymm2,%ymm19,%ymm19 + valignq $3,%ymm1,%ymm2,%ymm2 + valignq $3,%ymm0,%ymm1,%ymm1 + valignq $3,.Lzeros(%rip),%ymm0,%ymm0 + + + vpandq .Lmask52x4(%rip),%ymm11,%ymm11 + vpandq .Lmask52x4(%rip),%ymm12,%ymm12 + vpandq .Lmask52x4(%rip),%ymm13,%ymm13 + vpandq .Lmask52x4(%rip),%ymm14,%ymm14 + vpandq .Lmask52x4(%rip),%ymm15,%ymm15 + vpandq .Lmask52x4(%rip),%ymm16,%ymm16 + vpandq .Lmask52x4(%rip),%ymm17,%ymm17 + vpandq .Lmask52x4(%rip),%ymm18,%ymm18 + + + vpaddq %ymm0,%ymm11,%ymm11 + vpaddq %ymm1,%ymm12,%ymm12 + vpaddq %ymm2,%ymm13,%ymm13 + vpaddq %ymm19,%ymm14,%ymm14 + vpaddq %ymm20,%ymm15,%ymm15 + vpaddq %ymm21,%ymm16,%ymm16 + vpaddq %ymm22,%ymm17,%ymm17 + vpaddq %ymm23,%ymm18,%ymm18 + + + + vpcmpuq $6,.Lmask52x4(%rip),%ymm11,%k1 + vpcmpuq $6,.Lmask52x4(%rip),%ymm12,%k2 + kmovb %k1,%r14d + kmovb %k2,%r13d + shlb $4,%r13b + orb %r13b,%r14b + + vpcmpuq $6,.Lmask52x4(%rip),%ymm13,%k1 + vpcmpuq $6,.Lmask52x4(%rip),%ymm14,%k2 + kmovb %k1,%r13d + kmovb %k2,%r12d + shlb $4,%r12b + orb %r12b,%r13b + + vpcmpuq $6,.Lmask52x4(%rip),%ymm15,%k1 + vpcmpuq $6,.Lmask52x4(%rip),%ymm16,%k2 + kmovb %k1,%r12d + kmovb %k2,%r11d + shlb $4,%r11b + orb %r11b,%r12b + + vpcmpuq $6,.Lmask52x4(%rip),%ymm17,%k1 + vpcmpuq $6,.Lmask52x4(%rip),%ymm18,%k2 + kmovb %k1,%r11d + kmovb %k2,%r10d + shlb $4,%r10b + orb %r10b,%r11b + + addb %r14b,%r14b + adcb %r13b,%r13b + adcb %r12b,%r12b + adcb %r11b,%r11b + + + vpcmpuq $0,.Lmask52x4(%rip),%ymm11,%k1 + vpcmpuq $0,.Lmask52x4(%rip),%ymm12,%k2 + kmovb %k1,%r9d + kmovb %k2,%r8d + shlb $4,%r8b + orb %r8b,%r9b + + vpcmpuq $0,.Lmask52x4(%rip),%ymm13,%k1 + vpcmpuq $0,.Lmask52x4(%rip),%ymm14,%k2 + kmovb %k1,%r8d + kmovb %k2,%edx + shlb $4,%dl + orb %dl,%r8b + + vpcmpuq $0,.Lmask52x4(%rip),%ymm15,%k1 + vpcmpuq $0,.Lmask52x4(%rip),%ymm16,%k2 + kmovb %k1,%edx + kmovb %k2,%ecx + shlb $4,%cl + orb %cl,%dl + + vpcmpuq $0,.Lmask52x4(%rip),%ymm17,%k1 + vpcmpuq $0,.Lmask52x4(%rip),%ymm18,%k2 + kmovb %k1,%ecx + kmovb %k2,%ebx + shlb $4,%bl + orb %bl,%cl + + addb %r9b,%r14b + adcb %r8b,%r13b + adcb %dl,%r12b + adcb %cl,%r11b + + xorb %r9b,%r14b + xorb %r8b,%r13b + xorb %dl,%r12b + xorb %cl,%r11b + + kmovb %r14d,%k1 + shrb $4,%r14b + kmovb %r14d,%k2 + kmovb %r13d,%k3 + shrb $4,%r13b + kmovb %r13d,%k4 + kmovb %r12d,%k5 + shrb $4,%r12b + kmovb %r12d,%k6 + kmovb %r11d,%k7 + + vpsubq .Lmask52x4(%rip),%ymm11,%ymm11{%k1} + vpsubq .Lmask52x4(%rip),%ymm12,%ymm12{%k2} + vpsubq .Lmask52x4(%rip),%ymm13,%ymm13{%k3} + vpsubq .Lmask52x4(%rip),%ymm14,%ymm14{%k4} + vpsubq .Lmask52x4(%rip),%ymm15,%ymm15{%k5} + vpsubq .Lmask52x4(%rip),%ymm16,%ymm16{%k6} + vpsubq .Lmask52x4(%rip),%ymm17,%ymm17{%k7} + + vpandq .Lmask52x4(%rip),%ymm11,%ymm11 + vpandq .Lmask52x4(%rip),%ymm12,%ymm12 + vpandq .Lmask52x4(%rip),%ymm13,%ymm13 + vpandq .Lmask52x4(%rip),%ymm14,%ymm14 + vpandq .Lmask52x4(%rip),%ymm15,%ymm15 + vpandq .Lmask52x4(%rip),%ymm16,%ymm16 + vpandq .Lmask52x4(%rip),%ymm17,%ymm17 + + shrb $4,%r11b + kmovb %r11d,%k1 + + vpsubq .Lmask52x4(%rip),%ymm18,%ymm18{%k1} + + vpandq .Lmask52x4(%rip),%ymm18,%ymm18 + + vmovdqu64 %ymm3,0(%rdi) + vmovdqu64 %ymm4,32(%rdi) + vmovdqu64 %ymm5,64(%rdi) + vmovdqu64 %ymm6,96(%rdi) + vmovdqu64 %ymm7,128(%rdi) + vmovdqu64 %ymm8,160(%rdi) + vmovdqu64 %ymm9,192(%rdi) + vmovdqu64 %ymm10,224(%rdi) + + vmovdqu64 %ymm11,256(%rdi) + vmovdqu64 %ymm12,288(%rdi) + vmovdqu64 %ymm13,320(%rdi) + vmovdqu64 %ymm14,352(%rdi) + vmovdqu64 %ymm15,384(%rdi) + vmovdqu64 %ymm16,416(%rdi) + vmovdqu64 %ymm17,448(%rdi) + vmovdqu64 %ymm18,480(%rdi) + + vzeroupper + leaq (%rsp),%rax 
+.cfi_def_cfa_register %rax + movq 0(%rax),%r15 +.cfi_restore %r15 + movq 8(%rax),%r14 +.cfi_restore %r14 + movq 16(%rax),%r13 +.cfi_restore %r13 + movq 24(%rax),%r12 +.cfi_restore %r12 + movq 32(%rax),%rbp +.cfi_restore %rbp + movq 40(%rax),%rbx +.cfi_restore %rbx + leaq 48(%rax),%rsp +.cfi_def_cfa %rsp,8 +.Lossl_rsaz_amm52x30_x2_ifma256_epilogue: + .byte 0xf3,0xc3 +.cfi_endproc +.size ossl_rsaz_amm52x30_x2_ifma256, .-ossl_rsaz_amm52x30_x2_ifma256 +.text + +.align 32 +.globl ossl_extract_multiplier_2x30_win5 +.type ossl_extract_multiplier_2x30_win5,@function +ossl_extract_multiplier_2x30_win5: +.cfi_startproc +.byte 243,15,30,250 + vmovdqa64 .Lones(%rip),%ymm30 + vpbroadcastq %rdx,%ymm28 + vpbroadcastq %rcx,%ymm29 + leaq 16384(%rsi),%rax + + + vpxor %xmm0,%xmm0,%xmm0 + vmovdqa64 %ymm0,%ymm27 + vmovdqa64 %ymm0,%ymm1 + vmovdqa64 %ymm0,%ymm2 + vmovdqa64 %ymm0,%ymm3 + vmovdqa64 %ymm0,%ymm4 + vmovdqa64 %ymm0,%ymm5 + vmovdqa64 %ymm0,%ymm16 + vmovdqa64 %ymm0,%ymm17 + vmovdqa64 %ymm0,%ymm18 + vmovdqa64 %ymm0,%ymm19 + vmovdqa64 %ymm0,%ymm20 + vmovdqa64 %ymm0,%ymm21 + vmovdqa64 %ymm0,%ymm22 + vmovdqa64 %ymm0,%ymm23 + vmovdqa64 %ymm0,%ymm24 + vmovdqa64 %ymm0,%ymm25 + +.align 32 +.Lloop: + vpcmpq $0,%ymm27,%ymm28,%k1 + vpcmpq $0,%ymm27,%ymm29,%k2 + vmovdqu64 0(%rsi),%ymm26 + vpblendmq %ymm26,%ymm0,%ymm0{%k1} + vmovdqu64 32(%rsi),%ymm26 + vpblendmq %ymm26,%ymm1,%ymm1{%k1} + vmovdqu64 64(%rsi),%ymm26 + vpblendmq %ymm26,%ymm2,%ymm2{%k1} + vmovdqu64 96(%rsi),%ymm26 + vpblendmq %ymm26,%ymm3,%ymm3{%k1} + vmovdqu64 128(%rsi),%ymm26 + vpblendmq %ymm26,%ymm4,%ymm4{%k1} + vmovdqu64 160(%rsi),%ymm26 + vpblendmq %ymm26,%ymm5,%ymm5{%k1} + vmovdqu64 192(%rsi),%ymm26 + vpblendmq %ymm26,%ymm16,%ymm16{%k1} + vmovdqu64 224(%rsi),%ymm26 + vpblendmq %ymm26,%ymm17,%ymm17{%k1} + vmovdqu64 256(%rsi),%ymm26 + vpblendmq %ymm26,%ymm18,%ymm18{%k2} + vmovdqu64 288(%rsi),%ymm26 + vpblendmq %ymm26,%ymm19,%ymm19{%k2} + vmovdqu64 320(%rsi),%ymm26 + vpblendmq %ymm26,%ymm20,%ymm20{%k2} + vmovdqu64 352(%rsi),%ymm26 + vpblendmq %ymm26,%ymm21,%ymm21{%k2} + vmovdqu64 384(%rsi),%ymm26 + vpblendmq %ymm26,%ymm22,%ymm22{%k2} + vmovdqu64 416(%rsi),%ymm26 + vpblendmq %ymm26,%ymm23,%ymm23{%k2} + vmovdqu64 448(%rsi),%ymm26 + vpblendmq %ymm26,%ymm24,%ymm24{%k2} + vmovdqu64 480(%rsi),%ymm26 + vpblendmq %ymm26,%ymm25,%ymm25{%k2} + vpaddq %ymm30,%ymm27,%ymm27 + addq $512,%rsi + cmpq %rsi,%rax + jne .Lloop + vmovdqu64 %ymm0,0(%rdi) + vmovdqu64 %ymm1,32(%rdi) + vmovdqu64 %ymm2,64(%rdi) + vmovdqu64 %ymm3,96(%rdi) + vmovdqu64 %ymm4,128(%rdi) + vmovdqu64 %ymm5,160(%rdi) + vmovdqu64 %ymm16,192(%rdi) + vmovdqu64 %ymm17,224(%rdi) + vmovdqu64 %ymm18,256(%rdi) + vmovdqu64 %ymm19,288(%rdi) + vmovdqu64 %ymm20,320(%rdi) + vmovdqu64 %ymm21,352(%rdi) + vmovdqu64 %ymm22,384(%rdi) + vmovdqu64 %ymm23,416(%rdi) + vmovdqu64 %ymm24,448(%rdi) + vmovdqu64 %ymm25,480(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size ossl_extract_multiplier_2x30_win5, .-ossl_extract_multiplier_2x30_win5 +.section .rodata +.align 32 +.Lones: +.quad 1,1,1,1 +.Lzeros: +.quad 0,0,0,0 + .section ".note.gnu.property", "a" + .p2align 3 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + # "GNU" encoded with .byte, since .asciz isn't supported + # on Solaris. 
+ .byte 0x47 + .byte 0x4e + .byte 0x55 + .byte 0 +1: + .p2align 3 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 3 +4: diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/bn/rsaz-4k-avx512.s b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/bn/rsaz-4k-avx512.s new file mode 100644 index 000000000000..0f78a0294047 --- /dev/null +++ b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/bn/rsaz-4k-avx512.s @@ -0,0 +1,1374 @@ +.text + +.globl ossl_rsaz_amm52x40_x1_ifma256 +.type ossl_rsaz_amm52x40_x1_ifma256,@function +.align 32 +ossl_rsaz_amm52x40_x1_ifma256: +.cfi_startproc +.byte 243,15,30,250 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + + vpxord %ymm0,%ymm0,%ymm0 + vmovdqa64 %ymm0,%ymm3 + vmovdqa64 %ymm0,%ymm4 + vmovdqa64 %ymm0,%ymm5 + vmovdqa64 %ymm0,%ymm6 + vmovdqa64 %ymm0,%ymm7 + vmovdqa64 %ymm0,%ymm8 + vmovdqa64 %ymm0,%ymm9 + vmovdqa64 %ymm0,%ymm10 + vmovdqa64 %ymm0,%ymm11 + vmovdqa64 %ymm0,%ymm12 + + xorl %r9d,%r9d + + movq %rdx,%r11 + movq $0xfffffffffffff,%rax + + + movl $10,%ebx + +.align 32 +.Lloop10: + movq 0(%r11),%r13 + + vpbroadcastq %r13,%ymm1 + movq 0(%rsi),%rdx + mulxq %r13,%r13,%r12 + addq %r13,%r9 + movq %r12,%r10 + adcq $0,%r10 + + movq %r8,%r13 + imulq %r9,%r13 + andq %rax,%r13 + + vpbroadcastq %r13,%ymm2 + movq 0(%rcx),%rdx + mulxq %r13,%r13,%r12 + addq %r13,%r9 + adcq %r12,%r10 + + shrq $52,%r9 + salq $12,%r10 + orq %r10,%r9 + + vpmadd52luq 0(%rsi),%ymm1,%ymm3 + vpmadd52luq 32(%rsi),%ymm1,%ymm4 + vpmadd52luq 64(%rsi),%ymm1,%ymm5 + vpmadd52luq 96(%rsi),%ymm1,%ymm6 + vpmadd52luq 128(%rsi),%ymm1,%ymm7 + vpmadd52luq 160(%rsi),%ymm1,%ymm8 + vpmadd52luq 192(%rsi),%ymm1,%ymm9 + vpmadd52luq 224(%rsi),%ymm1,%ymm10 + vpmadd52luq 256(%rsi),%ymm1,%ymm11 + vpmadd52luq 288(%rsi),%ymm1,%ymm12 + + vpmadd52luq 0(%rcx),%ymm2,%ymm3 + vpmadd52luq 32(%rcx),%ymm2,%ymm4 + vpmadd52luq 64(%rcx),%ymm2,%ymm5 + vpmadd52luq 96(%rcx),%ymm2,%ymm6 + vpmadd52luq 128(%rcx),%ymm2,%ymm7 + vpmadd52luq 160(%rcx),%ymm2,%ymm8 + vpmadd52luq 192(%rcx),%ymm2,%ymm9 + vpmadd52luq 224(%rcx),%ymm2,%ymm10 + vpmadd52luq 256(%rcx),%ymm2,%ymm11 + vpmadd52luq 288(%rcx),%ymm2,%ymm12 + + + valignq $1,%ymm3,%ymm4,%ymm3 + valignq $1,%ymm4,%ymm5,%ymm4 + valignq $1,%ymm5,%ymm6,%ymm5 + valignq $1,%ymm6,%ymm7,%ymm6 + valignq $1,%ymm7,%ymm8,%ymm7 + valignq $1,%ymm8,%ymm9,%ymm8 + valignq $1,%ymm9,%ymm10,%ymm9 + valignq $1,%ymm10,%ymm11,%ymm10 + valignq $1,%ymm11,%ymm12,%ymm11 + valignq $1,%ymm12,%ymm0,%ymm12 + + vmovq %xmm3,%r13 + addq %r13,%r9 + + vpmadd52huq 0(%rsi),%ymm1,%ymm3 + vpmadd52huq 32(%rsi),%ymm1,%ymm4 + vpmadd52huq 64(%rsi),%ymm1,%ymm5 + vpmadd52huq 96(%rsi),%ymm1,%ymm6 + vpmadd52huq 128(%rsi),%ymm1,%ymm7 + vpmadd52huq 160(%rsi),%ymm1,%ymm8 + vpmadd52huq 192(%rsi),%ymm1,%ymm9 + vpmadd52huq 224(%rsi),%ymm1,%ymm10 + vpmadd52huq 256(%rsi),%ymm1,%ymm11 + vpmadd52huq 288(%rsi),%ymm1,%ymm12 + + vpmadd52huq 0(%rcx),%ymm2,%ymm3 + vpmadd52huq 32(%rcx),%ymm2,%ymm4 + vpmadd52huq 64(%rcx),%ymm2,%ymm5 + vpmadd52huq 96(%rcx),%ymm2,%ymm6 + vpmadd52huq 128(%rcx),%ymm2,%ymm7 + vpmadd52huq 160(%rcx),%ymm2,%ymm8 + vpmadd52huq 192(%rcx),%ymm2,%ymm9 + vpmadd52huq 224(%rcx),%ymm2,%ymm10 + vpmadd52huq 256(%rcx),%ymm2,%ymm11 + vpmadd52huq 288(%rcx),%ymm2,%ymm12 + movq 8(%r11),%r13 
+ + vpbroadcastq %r13,%ymm1 + movq 0(%rsi),%rdx + mulxq %r13,%r13,%r12 + addq %r13,%r9 + movq %r12,%r10 + adcq $0,%r10 + + movq %r8,%r13 + imulq %r9,%r13 + andq %rax,%r13 + + vpbroadcastq %r13,%ymm2 + movq 0(%rcx),%rdx + mulxq %r13,%r13,%r12 + addq %r13,%r9 + adcq %r12,%r10 + + shrq $52,%r9 + salq $12,%r10 + orq %r10,%r9 + + vpmadd52luq 0(%rsi),%ymm1,%ymm3 + vpmadd52luq 32(%rsi),%ymm1,%ymm4 + vpmadd52luq 64(%rsi),%ymm1,%ymm5 + vpmadd52luq 96(%rsi),%ymm1,%ymm6 + vpmadd52luq 128(%rsi),%ymm1,%ymm7 + vpmadd52luq 160(%rsi),%ymm1,%ymm8 + vpmadd52luq 192(%rsi),%ymm1,%ymm9 + vpmadd52luq 224(%rsi),%ymm1,%ymm10 + vpmadd52luq 256(%rsi),%ymm1,%ymm11 + vpmadd52luq 288(%rsi),%ymm1,%ymm12 + + vpmadd52luq 0(%rcx),%ymm2,%ymm3 + vpmadd52luq 32(%rcx),%ymm2,%ymm4 + vpmadd52luq 64(%rcx),%ymm2,%ymm5 + vpmadd52luq 96(%rcx),%ymm2,%ymm6 + vpmadd52luq 128(%rcx),%ymm2,%ymm7 + vpmadd52luq 160(%rcx),%ymm2,%ymm8 + vpmadd52luq 192(%rcx),%ymm2,%ymm9 + vpmadd52luq 224(%rcx),%ymm2,%ymm10 + vpmadd52luq 256(%rcx),%ymm2,%ymm11 + vpmadd52luq 288(%rcx),%ymm2,%ymm12 + + + valignq $1,%ymm3,%ymm4,%ymm3 + valignq $1,%ymm4,%ymm5,%ymm4 + valignq $1,%ymm5,%ymm6,%ymm5 + valignq $1,%ymm6,%ymm7,%ymm6 + valignq $1,%ymm7,%ymm8,%ymm7 + valignq $1,%ymm8,%ymm9,%ymm8 + valignq $1,%ymm9,%ymm10,%ymm9 + valignq $1,%ymm10,%ymm11,%ymm10 + valignq $1,%ymm11,%ymm12,%ymm11 + valignq $1,%ymm12,%ymm0,%ymm12 + + vmovq %xmm3,%r13 + addq %r13,%r9 + + vpmadd52huq 0(%rsi),%ymm1,%ymm3 + vpmadd52huq 32(%rsi),%ymm1,%ymm4 + vpmadd52huq 64(%rsi),%ymm1,%ymm5 + vpmadd52huq 96(%rsi),%ymm1,%ymm6 + vpmadd52huq 128(%rsi),%ymm1,%ymm7 + vpmadd52huq 160(%rsi),%ymm1,%ymm8 + vpmadd52huq 192(%rsi),%ymm1,%ymm9 + vpmadd52huq 224(%rsi),%ymm1,%ymm10 + vpmadd52huq 256(%rsi),%ymm1,%ymm11 + vpmadd52huq 288(%rsi),%ymm1,%ymm12 + + vpmadd52huq 0(%rcx),%ymm2,%ymm3 + vpmadd52huq 32(%rcx),%ymm2,%ymm4 + vpmadd52huq 64(%rcx),%ymm2,%ymm5 + vpmadd52huq 96(%rcx),%ymm2,%ymm6 + vpmadd52huq 128(%rcx),%ymm2,%ymm7 + vpmadd52huq 160(%rcx),%ymm2,%ymm8 + vpmadd52huq 192(%rcx),%ymm2,%ymm9 + vpmadd52huq 224(%rcx),%ymm2,%ymm10 + vpmadd52huq 256(%rcx),%ymm2,%ymm11 + vpmadd52huq 288(%rcx),%ymm2,%ymm12 + movq 16(%r11),%r13 + + vpbroadcastq %r13,%ymm1 + movq 0(%rsi),%rdx + mulxq %r13,%r13,%r12 + addq %r13,%r9 + movq %r12,%r10 + adcq $0,%r10 + + movq %r8,%r13 + imulq %r9,%r13 + andq %rax,%r13 + + vpbroadcastq %r13,%ymm2 + movq 0(%rcx),%rdx + mulxq %r13,%r13,%r12 + addq %r13,%r9 + adcq %r12,%r10 + + shrq $52,%r9 + salq $12,%r10 + orq %r10,%r9 + + vpmadd52luq 0(%rsi),%ymm1,%ymm3 + vpmadd52luq 32(%rsi),%ymm1,%ymm4 + vpmadd52luq 64(%rsi),%ymm1,%ymm5 + vpmadd52luq 96(%rsi),%ymm1,%ymm6 + vpmadd52luq 128(%rsi),%ymm1,%ymm7 + vpmadd52luq 160(%rsi),%ymm1,%ymm8 + vpmadd52luq 192(%rsi),%ymm1,%ymm9 + vpmadd52luq 224(%rsi),%ymm1,%ymm10 + vpmadd52luq 256(%rsi),%ymm1,%ymm11 + vpmadd52luq 288(%rsi),%ymm1,%ymm12 + + vpmadd52luq 0(%rcx),%ymm2,%ymm3 + vpmadd52luq 32(%rcx),%ymm2,%ymm4 + vpmadd52luq 64(%rcx),%ymm2,%ymm5 + vpmadd52luq 96(%rcx),%ymm2,%ymm6 + vpmadd52luq 128(%rcx),%ymm2,%ymm7 + vpmadd52luq 160(%rcx),%ymm2,%ymm8 + vpmadd52luq 192(%rcx),%ymm2,%ymm9 + vpmadd52luq 224(%rcx),%ymm2,%ymm10 + vpmadd52luq 256(%rcx),%ymm2,%ymm11 + vpmadd52luq 288(%rcx),%ymm2,%ymm12 + + + valignq $1,%ymm3,%ymm4,%ymm3 + valignq $1,%ymm4,%ymm5,%ymm4 + valignq $1,%ymm5,%ymm6,%ymm5 + valignq $1,%ymm6,%ymm7,%ymm6 + valignq $1,%ymm7,%ymm8,%ymm7 + valignq $1,%ymm8,%ymm9,%ymm8 + valignq $1,%ymm9,%ymm10,%ymm9 + valignq $1,%ymm10,%ymm11,%ymm10 + valignq $1,%ymm11,%ymm12,%ymm11 + valignq $1,%ymm12,%ymm0,%ymm12 + + vmovq %xmm3,%r13 + addq %r13,%r9 + + 
vpmadd52huq 0(%rsi),%ymm1,%ymm3 + vpmadd52huq 32(%rsi),%ymm1,%ymm4 + vpmadd52huq 64(%rsi),%ymm1,%ymm5 + vpmadd52huq 96(%rsi),%ymm1,%ymm6 + vpmadd52huq 128(%rsi),%ymm1,%ymm7 + vpmadd52huq 160(%rsi),%ymm1,%ymm8 + vpmadd52huq 192(%rsi),%ymm1,%ymm9 + vpmadd52huq 224(%rsi),%ymm1,%ymm10 + vpmadd52huq 256(%rsi),%ymm1,%ymm11 + vpmadd52huq 288(%rsi),%ymm1,%ymm12 + + vpmadd52huq 0(%rcx),%ymm2,%ymm3 + vpmadd52huq 32(%rcx),%ymm2,%ymm4 + vpmadd52huq 64(%rcx),%ymm2,%ymm5 + vpmadd52huq 96(%rcx),%ymm2,%ymm6 + vpmadd52huq 128(%rcx),%ymm2,%ymm7 + vpmadd52huq 160(%rcx),%ymm2,%ymm8 + vpmadd52huq 192(%rcx),%ymm2,%ymm9 + vpmadd52huq 224(%rcx),%ymm2,%ymm10 + vpmadd52huq 256(%rcx),%ymm2,%ymm11 + vpmadd52huq 288(%rcx),%ymm2,%ymm12 + movq 24(%r11),%r13 + + vpbroadcastq %r13,%ymm1 + movq 0(%rsi),%rdx + mulxq %r13,%r13,%r12 + addq %r13,%r9 + movq %r12,%r10 + adcq $0,%r10 + + movq %r8,%r13 + imulq %r9,%r13 + andq %rax,%r13 + + vpbroadcastq %r13,%ymm2 + movq 0(%rcx),%rdx + mulxq %r13,%r13,%r12 + addq %r13,%r9 + adcq %r12,%r10 + + shrq $52,%r9 + salq $12,%r10 + orq %r10,%r9 + + vpmadd52luq 0(%rsi),%ymm1,%ymm3 + vpmadd52luq 32(%rsi),%ymm1,%ymm4 + vpmadd52luq 64(%rsi),%ymm1,%ymm5 + vpmadd52luq 96(%rsi),%ymm1,%ymm6 + vpmadd52luq 128(%rsi),%ymm1,%ymm7 + vpmadd52luq 160(%rsi),%ymm1,%ymm8 + vpmadd52luq 192(%rsi),%ymm1,%ymm9 + vpmadd52luq 224(%rsi),%ymm1,%ymm10 + vpmadd52luq 256(%rsi),%ymm1,%ymm11 + vpmadd52luq 288(%rsi),%ymm1,%ymm12 + + vpmadd52luq 0(%rcx),%ymm2,%ymm3 + vpmadd52luq 32(%rcx),%ymm2,%ymm4 + vpmadd52luq 64(%rcx),%ymm2,%ymm5 + vpmadd52luq 96(%rcx),%ymm2,%ymm6 + vpmadd52luq 128(%rcx),%ymm2,%ymm7 + vpmadd52luq 160(%rcx),%ymm2,%ymm8 + vpmadd52luq 192(%rcx),%ymm2,%ymm9 + vpmadd52luq 224(%rcx),%ymm2,%ymm10 + vpmadd52luq 256(%rcx),%ymm2,%ymm11 + vpmadd52luq 288(%rcx),%ymm2,%ymm12 + + + valignq $1,%ymm3,%ymm4,%ymm3 + valignq $1,%ymm4,%ymm5,%ymm4 + valignq $1,%ymm5,%ymm6,%ymm5 + valignq $1,%ymm6,%ymm7,%ymm6 + valignq $1,%ymm7,%ymm8,%ymm7 + valignq $1,%ymm8,%ymm9,%ymm8 + valignq $1,%ymm9,%ymm10,%ymm9 + valignq $1,%ymm10,%ymm11,%ymm10 + valignq $1,%ymm11,%ymm12,%ymm11 + valignq $1,%ymm12,%ymm0,%ymm12 + + vmovq %xmm3,%r13 + addq %r13,%r9 + + vpmadd52huq 0(%rsi),%ymm1,%ymm3 + vpmadd52huq 32(%rsi),%ymm1,%ymm4 + vpmadd52huq 64(%rsi),%ymm1,%ymm5 + vpmadd52huq 96(%rsi),%ymm1,%ymm6 + vpmadd52huq 128(%rsi),%ymm1,%ymm7 + vpmadd52huq 160(%rsi),%ymm1,%ymm8 + vpmadd52huq 192(%rsi),%ymm1,%ymm9 + vpmadd52huq 224(%rsi),%ymm1,%ymm10 + vpmadd52huq 256(%rsi),%ymm1,%ymm11 + vpmadd52huq 288(%rsi),%ymm1,%ymm12 + + vpmadd52huq 0(%rcx),%ymm2,%ymm3 + vpmadd52huq 32(%rcx),%ymm2,%ymm4 + vpmadd52huq 64(%rcx),%ymm2,%ymm5 + vpmadd52huq 96(%rcx),%ymm2,%ymm6 + vpmadd52huq 128(%rcx),%ymm2,%ymm7 + vpmadd52huq 160(%rcx),%ymm2,%ymm8 + vpmadd52huq 192(%rcx),%ymm2,%ymm9 + vpmadd52huq 224(%rcx),%ymm2,%ymm10 + vpmadd52huq 256(%rcx),%ymm2,%ymm11 + vpmadd52huq 288(%rcx),%ymm2,%ymm12 + leaq 32(%r11),%r11 + decl %ebx + jne .Lloop10 + + vpbroadcastq %r9,%ymm0 + vpblendd $3,%ymm0,%ymm3,%ymm3 + + + + vpsrlq $52,%ymm3,%ymm0 + vpsrlq $52,%ymm4,%ymm1 + vpsrlq $52,%ymm5,%ymm2 + vpsrlq $52,%ymm6,%ymm23 + vpsrlq $52,%ymm7,%ymm24 + vpsrlq $52,%ymm8,%ymm25 + vpsrlq $52,%ymm9,%ymm26 + vpsrlq $52,%ymm10,%ymm27 + vpsrlq $52,%ymm11,%ymm28 + vpsrlq $52,%ymm12,%ymm29 + + + valignq $3,%ymm28,%ymm29,%ymm29 + valignq $3,%ymm27,%ymm28,%ymm28 + valignq $3,%ymm26,%ymm27,%ymm27 + valignq $3,%ymm25,%ymm26,%ymm26 + valignq $3,%ymm24,%ymm25,%ymm25 + valignq $3,%ymm23,%ymm24,%ymm24 + valignq $3,%ymm2,%ymm23,%ymm23 + valignq $3,%ymm1,%ymm2,%ymm2 + valignq $3,%ymm0,%ymm1,%ymm1 + valignq 
$3,.Lzeros(%rip),%ymm0,%ymm0 + + + vpandq .Lmask52x4(%rip),%ymm3,%ymm3 + vpandq .Lmask52x4(%rip),%ymm4,%ymm4 + vpandq .Lmask52x4(%rip),%ymm5,%ymm5 + vpandq .Lmask52x4(%rip),%ymm6,%ymm6 + vpandq .Lmask52x4(%rip),%ymm7,%ymm7 + vpandq .Lmask52x4(%rip),%ymm8,%ymm8 + vpandq .Lmask52x4(%rip),%ymm9,%ymm9 + vpandq .Lmask52x4(%rip),%ymm10,%ymm10 + vpandq .Lmask52x4(%rip),%ymm11,%ymm11 + vpandq .Lmask52x4(%rip),%ymm12,%ymm12 + + + vpaddq %ymm0,%ymm3,%ymm3 + vpaddq %ymm1,%ymm4,%ymm4 + vpaddq %ymm2,%ymm5,%ymm5 + vpaddq %ymm23,%ymm6,%ymm6 + vpaddq %ymm24,%ymm7,%ymm7 + vpaddq %ymm25,%ymm8,%ymm8 + vpaddq %ymm26,%ymm9,%ymm9 + vpaddq %ymm27,%ymm10,%ymm10 + vpaddq %ymm28,%ymm11,%ymm11 + vpaddq %ymm29,%ymm12,%ymm12 + + + + vpcmpuq $6,.Lmask52x4(%rip),%ymm3,%k1 + vpcmpuq $6,.Lmask52x4(%rip),%ymm4,%k2 + kmovb %k1,%r14d + kmovb %k2,%r13d + shlb $4,%r13b + orb %r13b,%r14b + + vpcmpuq $6,.Lmask52x4(%rip),%ymm5,%k1 + vpcmpuq $6,.Lmask52x4(%rip),%ymm6,%k2 + kmovb %k1,%r13d + kmovb %k2,%r12d + shlb $4,%r12b + orb %r12b,%r13b + + vpcmpuq $6,.Lmask52x4(%rip),%ymm7,%k1 + vpcmpuq $6,.Lmask52x4(%rip),%ymm8,%k2 + kmovb %k1,%r12d + kmovb %k2,%r11d + shlb $4,%r11b + orb %r11b,%r12b + + vpcmpuq $6,.Lmask52x4(%rip),%ymm9,%k1 + vpcmpuq $6,.Lmask52x4(%rip),%ymm10,%k2 + kmovb %k1,%r11d + kmovb %k2,%r10d + shlb $4,%r10b + orb %r10b,%r11b + + vpcmpuq $6,.Lmask52x4(%rip),%ymm11,%k1 + vpcmpuq $6,.Lmask52x4(%rip),%ymm12,%k2 + kmovb %k1,%r10d + kmovb %k2,%r9d + shlb $4,%r9b + orb %r9b,%r10b + + addb %r14b,%r14b + adcb %r13b,%r13b + adcb %r12b,%r12b + adcb %r11b,%r11b + adcb %r10b,%r10b + + + vpcmpuq $0,.Lmask52x4(%rip),%ymm3,%k1 + vpcmpuq $0,.Lmask52x4(%rip),%ymm4,%k2 + kmovb %k1,%r9d + kmovb %k2,%r8d + shlb $4,%r8b + orb %r8b,%r9b + + vpcmpuq $0,.Lmask52x4(%rip),%ymm5,%k1 + vpcmpuq $0,.Lmask52x4(%rip),%ymm6,%k2 + kmovb %k1,%r8d + kmovb %k2,%edx + shlb $4,%dl + orb %dl,%r8b + + vpcmpuq $0,.Lmask52x4(%rip),%ymm7,%k1 + vpcmpuq $0,.Lmask52x4(%rip),%ymm8,%k2 + kmovb %k1,%edx + kmovb %k2,%ecx + shlb $4,%cl + orb %cl,%dl + + vpcmpuq $0,.Lmask52x4(%rip),%ymm9,%k1 + vpcmpuq $0,.Lmask52x4(%rip),%ymm10,%k2 + kmovb %k1,%ecx + kmovb %k2,%ebx + shlb $4,%bl + orb %bl,%cl + + vpcmpuq $0,.Lmask52x4(%rip),%ymm11,%k1 + vpcmpuq $0,.Lmask52x4(%rip),%ymm12,%k2 + kmovb %k1,%ebx + kmovb %k2,%eax + shlb $4,%al + orb %al,%bl + + addb %r9b,%r14b + adcb %r8b,%r13b + adcb %dl,%r12b + adcb %cl,%r11b + adcb %bl,%r10b + + xorb %r9b,%r14b + xorb %r8b,%r13b + xorb %dl,%r12b + xorb %cl,%r11b + xorb %bl,%r10b + + kmovb %r14d,%k1 + shrb $4,%r14b + kmovb %r14d,%k2 + kmovb %r13d,%k3 + shrb $4,%r13b + kmovb %r13d,%k4 + kmovb %r12d,%k5 + shrb $4,%r12b + kmovb %r12d,%k6 + kmovb %r11d,%k7 + + vpsubq .Lmask52x4(%rip),%ymm3,%ymm3{%k1} + vpsubq .Lmask52x4(%rip),%ymm4,%ymm4{%k2} + vpsubq .Lmask52x4(%rip),%ymm5,%ymm5{%k3} + vpsubq .Lmask52x4(%rip),%ymm6,%ymm6{%k4} + vpsubq .Lmask52x4(%rip),%ymm7,%ymm7{%k5} + vpsubq .Lmask52x4(%rip),%ymm8,%ymm8{%k6} + vpsubq .Lmask52x4(%rip),%ymm9,%ymm9{%k7} + + vpandq .Lmask52x4(%rip),%ymm3,%ymm3 + vpandq .Lmask52x4(%rip),%ymm4,%ymm4 + vpandq .Lmask52x4(%rip),%ymm5,%ymm5 + vpandq .Lmask52x4(%rip),%ymm6,%ymm6 + vpandq .Lmask52x4(%rip),%ymm7,%ymm7 + vpandq .Lmask52x4(%rip),%ymm8,%ymm8 + vpandq .Lmask52x4(%rip),%ymm9,%ymm9 + + shrb $4,%r11b + kmovb %r11d,%k1 + kmovb %r10d,%k2 + shrb $4,%r10b + kmovb %r10d,%k3 + + vpsubq .Lmask52x4(%rip),%ymm10,%ymm10{%k1} + vpsubq .Lmask52x4(%rip),%ymm11,%ymm11{%k2} + vpsubq .Lmask52x4(%rip),%ymm12,%ymm12{%k3} + + vpandq .Lmask52x4(%rip),%ymm10,%ymm10 + vpandq .Lmask52x4(%rip),%ymm11,%ymm11 + vpandq 
.Lmask52x4(%rip),%ymm12,%ymm12 + + vmovdqu64 %ymm3,0(%rdi) + vmovdqu64 %ymm4,32(%rdi) + vmovdqu64 %ymm5,64(%rdi) + vmovdqu64 %ymm6,96(%rdi) + vmovdqu64 %ymm7,128(%rdi) + vmovdqu64 %ymm8,160(%rdi) + vmovdqu64 %ymm9,192(%rdi) + vmovdqu64 %ymm10,224(%rdi) + vmovdqu64 %ymm11,256(%rdi) + vmovdqu64 %ymm12,288(%rdi) + + vzeroupper + leaq (%rsp),%rax +.cfi_def_cfa_register %rax + movq 0(%rax),%r15 +.cfi_restore %r15 + movq 8(%rax),%r14 +.cfi_restore %r14 + movq 16(%rax),%r13 +.cfi_restore %r13 + movq 24(%rax),%r12 +.cfi_restore %r12 + movq 32(%rax),%rbp +.cfi_restore %rbp + movq 40(%rax),%rbx +.cfi_restore %rbx + leaq 48(%rax),%rsp +.cfi_def_cfa %rsp,8 +.Lossl_rsaz_amm52x40_x1_ifma256_epilogue: + + .byte 0xf3,0xc3 +.cfi_endproc +.size ossl_rsaz_amm52x40_x1_ifma256, .-ossl_rsaz_amm52x40_x1_ifma256 +.section .rodata +.align 32 +.Lmask52x4: +.quad 0xfffffffffffff +.quad 0xfffffffffffff +.quad 0xfffffffffffff +.quad 0xfffffffffffff +.text + +.globl ossl_rsaz_amm52x40_x2_ifma256 +.type ossl_rsaz_amm52x40_x2_ifma256,@function +.align 32 +ossl_rsaz_amm52x40_x2_ifma256: +.cfi_startproc +.byte 243,15,30,250 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + + vpxord %ymm0,%ymm0,%ymm0 + vmovdqa64 %ymm0,%ymm3 + vmovdqa64 %ymm0,%ymm4 + vmovdqa64 %ymm0,%ymm5 + vmovdqa64 %ymm0,%ymm6 + vmovdqa64 %ymm0,%ymm7 + vmovdqa64 %ymm0,%ymm8 + vmovdqa64 %ymm0,%ymm9 + vmovdqa64 %ymm0,%ymm10 + vmovdqa64 %ymm0,%ymm11 + vmovdqa64 %ymm0,%ymm12 + + vmovdqa64 %ymm0,%ymm13 + vmovdqa64 %ymm0,%ymm14 + vmovdqa64 %ymm0,%ymm15 + vmovdqa64 %ymm0,%ymm16 + vmovdqa64 %ymm0,%ymm17 + vmovdqa64 %ymm0,%ymm18 + vmovdqa64 %ymm0,%ymm19 + vmovdqa64 %ymm0,%ymm20 + vmovdqa64 %ymm0,%ymm21 + vmovdqa64 %ymm0,%ymm22 + + + xorl %r9d,%r9d + xorl %r15d,%r15d + + movq %rdx,%r11 + movq $0xfffffffffffff,%rax + + movl $40,%ebx + +.align 32 +.Lloop40: + movq 0(%r11),%r13 + + vpbroadcastq %r13,%ymm1 + movq 0(%rsi),%rdx + mulxq %r13,%r13,%r12 + addq %r13,%r9 + movq %r12,%r10 + adcq $0,%r10 + + movq (%r8),%r13 + imulq %r9,%r13 + andq %rax,%r13 + + vpbroadcastq %r13,%ymm2 + movq 0(%rcx),%rdx + mulxq %r13,%r13,%r12 + addq %r13,%r9 + adcq %r12,%r10 + + shrq $52,%r9 + salq $12,%r10 + orq %r10,%r9 + + vpmadd52luq 0(%rsi),%ymm1,%ymm3 + vpmadd52luq 32(%rsi),%ymm1,%ymm4 + vpmadd52luq 64(%rsi),%ymm1,%ymm5 + vpmadd52luq 96(%rsi),%ymm1,%ymm6 + vpmadd52luq 128(%rsi),%ymm1,%ymm7 + vpmadd52luq 160(%rsi),%ymm1,%ymm8 + vpmadd52luq 192(%rsi),%ymm1,%ymm9 + vpmadd52luq 224(%rsi),%ymm1,%ymm10 + vpmadd52luq 256(%rsi),%ymm1,%ymm11 + vpmadd52luq 288(%rsi),%ymm1,%ymm12 + + vpmadd52luq 0(%rcx),%ymm2,%ymm3 + vpmadd52luq 32(%rcx),%ymm2,%ymm4 + vpmadd52luq 64(%rcx),%ymm2,%ymm5 + vpmadd52luq 96(%rcx),%ymm2,%ymm6 + vpmadd52luq 128(%rcx),%ymm2,%ymm7 + vpmadd52luq 160(%rcx),%ymm2,%ymm8 + vpmadd52luq 192(%rcx),%ymm2,%ymm9 + vpmadd52luq 224(%rcx),%ymm2,%ymm10 + vpmadd52luq 256(%rcx),%ymm2,%ymm11 + vpmadd52luq 288(%rcx),%ymm2,%ymm12 + + + valignq $1,%ymm3,%ymm4,%ymm3 + valignq $1,%ymm4,%ymm5,%ymm4 + valignq $1,%ymm5,%ymm6,%ymm5 + valignq $1,%ymm6,%ymm7,%ymm6 + valignq $1,%ymm7,%ymm8,%ymm7 + valignq $1,%ymm8,%ymm9,%ymm8 + valignq $1,%ymm9,%ymm10,%ymm9 + valignq $1,%ymm10,%ymm11,%ymm10 + valignq $1,%ymm11,%ymm12,%ymm11 + valignq $1,%ymm12,%ymm0,%ymm12 + + vmovq %xmm3,%r13 + addq %r13,%r9 + 
+ vpmadd52huq 0(%rsi),%ymm1,%ymm3 + vpmadd52huq 32(%rsi),%ymm1,%ymm4 + vpmadd52huq 64(%rsi),%ymm1,%ymm5 + vpmadd52huq 96(%rsi),%ymm1,%ymm6 + vpmadd52huq 128(%rsi),%ymm1,%ymm7 + vpmadd52huq 160(%rsi),%ymm1,%ymm8 + vpmadd52huq 192(%rsi),%ymm1,%ymm9 + vpmadd52huq 224(%rsi),%ymm1,%ymm10 + vpmadd52huq 256(%rsi),%ymm1,%ymm11 + vpmadd52huq 288(%rsi),%ymm1,%ymm12 + + vpmadd52huq 0(%rcx),%ymm2,%ymm3 + vpmadd52huq 32(%rcx),%ymm2,%ymm4 + vpmadd52huq 64(%rcx),%ymm2,%ymm5 + vpmadd52huq 96(%rcx),%ymm2,%ymm6 + vpmadd52huq 128(%rcx),%ymm2,%ymm7 + vpmadd52huq 160(%rcx),%ymm2,%ymm8 + vpmadd52huq 192(%rcx),%ymm2,%ymm9 + vpmadd52huq 224(%rcx),%ymm2,%ymm10 + vpmadd52huq 256(%rcx),%ymm2,%ymm11 + vpmadd52huq 288(%rcx),%ymm2,%ymm12 + movq 320(%r11),%r13 + + vpbroadcastq %r13,%ymm1 + movq 320(%rsi),%rdx + mulxq %r13,%r13,%r12 + addq %r13,%r15 + movq %r12,%r10 + adcq $0,%r10 + + movq 8(%r8),%r13 + imulq %r15,%r13 + andq %rax,%r13 + + vpbroadcastq %r13,%ymm2 + movq 320(%rcx),%rdx + mulxq %r13,%r13,%r12 + addq %r13,%r15 + adcq %r12,%r10 + + shrq $52,%r15 + salq $12,%r10 + orq %r10,%r15 + + vpmadd52luq 320(%rsi),%ymm1,%ymm13 + vpmadd52luq 352(%rsi),%ymm1,%ymm14 + vpmadd52luq 384(%rsi),%ymm1,%ymm15 + vpmadd52luq 416(%rsi),%ymm1,%ymm16 + vpmadd52luq 448(%rsi),%ymm1,%ymm17 + vpmadd52luq 480(%rsi),%ymm1,%ymm18 + vpmadd52luq 512(%rsi),%ymm1,%ymm19 + vpmadd52luq 544(%rsi),%ymm1,%ymm20 + vpmadd52luq 576(%rsi),%ymm1,%ymm21 + vpmadd52luq 608(%rsi),%ymm1,%ymm22 + + vpmadd52luq 320(%rcx),%ymm2,%ymm13 + vpmadd52luq 352(%rcx),%ymm2,%ymm14 + vpmadd52luq 384(%rcx),%ymm2,%ymm15 + vpmadd52luq 416(%rcx),%ymm2,%ymm16 + vpmadd52luq 448(%rcx),%ymm2,%ymm17 + vpmadd52luq 480(%rcx),%ymm2,%ymm18 + vpmadd52luq 512(%rcx),%ymm2,%ymm19 + vpmadd52luq 544(%rcx),%ymm2,%ymm20 + vpmadd52luq 576(%rcx),%ymm2,%ymm21 + vpmadd52luq 608(%rcx),%ymm2,%ymm22 + + + valignq $1,%ymm13,%ymm14,%ymm13 + valignq $1,%ymm14,%ymm15,%ymm14 + valignq $1,%ymm15,%ymm16,%ymm15 + valignq $1,%ymm16,%ymm17,%ymm16 + valignq $1,%ymm17,%ymm18,%ymm17 + valignq $1,%ymm18,%ymm19,%ymm18 + valignq $1,%ymm19,%ymm20,%ymm19 + valignq $1,%ymm20,%ymm21,%ymm20 + valignq $1,%ymm21,%ymm22,%ymm21 + valignq $1,%ymm22,%ymm0,%ymm22 + + vmovq %xmm13,%r13 + addq %r13,%r15 + + vpmadd52huq 320(%rsi),%ymm1,%ymm13 + vpmadd52huq 352(%rsi),%ymm1,%ymm14 + vpmadd52huq 384(%rsi),%ymm1,%ymm15 + vpmadd52huq 416(%rsi),%ymm1,%ymm16 + vpmadd52huq 448(%rsi),%ymm1,%ymm17 + vpmadd52huq 480(%rsi),%ymm1,%ymm18 + vpmadd52huq 512(%rsi),%ymm1,%ymm19 + vpmadd52huq 544(%rsi),%ymm1,%ymm20 + vpmadd52huq 576(%rsi),%ymm1,%ymm21 + vpmadd52huq 608(%rsi),%ymm1,%ymm22 + + vpmadd52huq 320(%rcx),%ymm2,%ymm13 + vpmadd52huq 352(%rcx),%ymm2,%ymm14 + vpmadd52huq 384(%rcx),%ymm2,%ymm15 + vpmadd52huq 416(%rcx),%ymm2,%ymm16 + vpmadd52huq 448(%rcx),%ymm2,%ymm17 + vpmadd52huq 480(%rcx),%ymm2,%ymm18 + vpmadd52huq 512(%rcx),%ymm2,%ymm19 + vpmadd52huq 544(%rcx),%ymm2,%ymm20 + vpmadd52huq 576(%rcx),%ymm2,%ymm21 + vpmadd52huq 608(%rcx),%ymm2,%ymm22 + leaq 8(%r11),%r11 + decl %ebx + jne .Lloop40 + + vpbroadcastq %r9,%ymm0 + vpblendd $3,%ymm0,%ymm3,%ymm3 + + + + vpsrlq $52,%ymm3,%ymm0 + vpsrlq $52,%ymm4,%ymm1 + vpsrlq $52,%ymm5,%ymm2 + vpsrlq $52,%ymm6,%ymm23 + vpsrlq $52,%ymm7,%ymm24 + vpsrlq $52,%ymm8,%ymm25 + vpsrlq $52,%ymm9,%ymm26 + vpsrlq $52,%ymm10,%ymm27 + vpsrlq $52,%ymm11,%ymm28 + vpsrlq $52,%ymm12,%ymm29 + + + valignq $3,%ymm28,%ymm29,%ymm29 + valignq $3,%ymm27,%ymm28,%ymm28 + valignq $3,%ymm26,%ymm27,%ymm27 + valignq $3,%ymm25,%ymm26,%ymm26 + valignq $3,%ymm24,%ymm25,%ymm25 + valignq $3,%ymm23,%ymm24,%ymm24 + valignq 
$3,%ymm2,%ymm23,%ymm23 + valignq $3,%ymm1,%ymm2,%ymm2 + valignq $3,%ymm0,%ymm1,%ymm1 + valignq $3,.Lzeros(%rip),%ymm0,%ymm0 + + + vpandq .Lmask52x4(%rip),%ymm3,%ymm3 + vpandq .Lmask52x4(%rip),%ymm4,%ymm4 + vpandq .Lmask52x4(%rip),%ymm5,%ymm5 + vpandq .Lmask52x4(%rip),%ymm6,%ymm6 + vpandq .Lmask52x4(%rip),%ymm7,%ymm7 + vpandq .Lmask52x4(%rip),%ymm8,%ymm8 + vpandq .Lmask52x4(%rip),%ymm9,%ymm9 + vpandq .Lmask52x4(%rip),%ymm10,%ymm10 + vpandq .Lmask52x4(%rip),%ymm11,%ymm11 + vpandq .Lmask52x4(%rip),%ymm12,%ymm12 + + + vpaddq %ymm0,%ymm3,%ymm3 + vpaddq %ymm1,%ymm4,%ymm4 + vpaddq %ymm2,%ymm5,%ymm5 + vpaddq %ymm23,%ymm6,%ymm6 + vpaddq %ymm24,%ymm7,%ymm7 + vpaddq %ymm25,%ymm8,%ymm8 + vpaddq %ymm26,%ymm9,%ymm9 + vpaddq %ymm27,%ymm10,%ymm10 + vpaddq %ymm28,%ymm11,%ymm11 + vpaddq %ymm29,%ymm12,%ymm12 + + + + vpcmpuq $6,.Lmask52x4(%rip),%ymm3,%k1 + vpcmpuq $6,.Lmask52x4(%rip),%ymm4,%k2 + kmovb %k1,%r14d + kmovb %k2,%r13d + shlb $4,%r13b + orb %r13b,%r14b + + vpcmpuq $6,.Lmask52x4(%rip),%ymm5,%k1 + vpcmpuq $6,.Lmask52x4(%rip),%ymm6,%k2 + kmovb %k1,%r13d + kmovb %k2,%r12d + shlb $4,%r12b + orb %r12b,%r13b + + vpcmpuq $6,.Lmask52x4(%rip),%ymm7,%k1 + vpcmpuq $6,.Lmask52x4(%rip),%ymm8,%k2 + kmovb %k1,%r12d + kmovb %k2,%r11d + shlb $4,%r11b + orb %r11b,%r12b + + vpcmpuq $6,.Lmask52x4(%rip),%ymm9,%k1 + vpcmpuq $6,.Lmask52x4(%rip),%ymm10,%k2 + kmovb %k1,%r11d + kmovb %k2,%r10d + shlb $4,%r10b + orb %r10b,%r11b + + vpcmpuq $6,.Lmask52x4(%rip),%ymm11,%k1 + vpcmpuq $6,.Lmask52x4(%rip),%ymm12,%k2 + kmovb %k1,%r10d + kmovb %k2,%r9d + shlb $4,%r9b + orb %r9b,%r10b + + addb %r14b,%r14b + adcb %r13b,%r13b + adcb %r12b,%r12b + adcb %r11b,%r11b + adcb %r10b,%r10b + + + vpcmpuq $0,.Lmask52x4(%rip),%ymm3,%k1 + vpcmpuq $0,.Lmask52x4(%rip),%ymm4,%k2 + kmovb %k1,%r9d + kmovb %k2,%r8d + shlb $4,%r8b + orb %r8b,%r9b + + vpcmpuq $0,.Lmask52x4(%rip),%ymm5,%k1 + vpcmpuq $0,.Lmask52x4(%rip),%ymm6,%k2 + kmovb %k1,%r8d + kmovb %k2,%edx + shlb $4,%dl + orb %dl,%r8b + + vpcmpuq $0,.Lmask52x4(%rip),%ymm7,%k1 + vpcmpuq $0,.Lmask52x4(%rip),%ymm8,%k2 + kmovb %k1,%edx + kmovb %k2,%ecx + shlb $4,%cl + orb %cl,%dl + + vpcmpuq $0,.Lmask52x4(%rip),%ymm9,%k1 + vpcmpuq $0,.Lmask52x4(%rip),%ymm10,%k2 + kmovb %k1,%ecx + kmovb %k2,%ebx + shlb $4,%bl + orb %bl,%cl + + vpcmpuq $0,.Lmask52x4(%rip),%ymm11,%k1 + vpcmpuq $0,.Lmask52x4(%rip),%ymm12,%k2 + kmovb %k1,%ebx + kmovb %k2,%eax + shlb $4,%al + orb %al,%bl + + addb %r9b,%r14b + adcb %r8b,%r13b + adcb %dl,%r12b + adcb %cl,%r11b + adcb %bl,%r10b + + xorb %r9b,%r14b + xorb %r8b,%r13b + xorb %dl,%r12b + xorb %cl,%r11b + xorb %bl,%r10b + + kmovb %r14d,%k1 + shrb $4,%r14b + kmovb %r14d,%k2 + kmovb %r13d,%k3 + shrb $4,%r13b + kmovb %r13d,%k4 + kmovb %r12d,%k5 + shrb $4,%r12b + kmovb %r12d,%k6 + kmovb %r11d,%k7 + + vpsubq .Lmask52x4(%rip),%ymm3,%ymm3{%k1} + vpsubq .Lmask52x4(%rip),%ymm4,%ymm4{%k2} + vpsubq .Lmask52x4(%rip),%ymm5,%ymm5{%k3} + vpsubq .Lmask52x4(%rip),%ymm6,%ymm6{%k4} + vpsubq .Lmask52x4(%rip),%ymm7,%ymm7{%k5} + vpsubq .Lmask52x4(%rip),%ymm8,%ymm8{%k6} + vpsubq .Lmask52x4(%rip),%ymm9,%ymm9{%k7} + + vpandq .Lmask52x4(%rip),%ymm3,%ymm3 + vpandq .Lmask52x4(%rip),%ymm4,%ymm4 + vpandq .Lmask52x4(%rip),%ymm5,%ymm5 + vpandq .Lmask52x4(%rip),%ymm6,%ymm6 + vpandq .Lmask52x4(%rip),%ymm7,%ymm7 + vpandq .Lmask52x4(%rip),%ymm8,%ymm8 + vpandq .Lmask52x4(%rip),%ymm9,%ymm9 + + shrb $4,%r11b + kmovb %r11d,%k1 + kmovb %r10d,%k2 + shrb $4,%r10b + kmovb %r10d,%k3 + + vpsubq .Lmask52x4(%rip),%ymm10,%ymm10{%k1} + vpsubq .Lmask52x4(%rip),%ymm11,%ymm11{%k2} + vpsubq .Lmask52x4(%rip),%ymm12,%ymm12{%k3} + + vpandq 
.Lmask52x4(%rip),%ymm10,%ymm10 + vpandq .Lmask52x4(%rip),%ymm11,%ymm11 + vpandq .Lmask52x4(%rip),%ymm12,%ymm12 + + vpbroadcastq %r15,%ymm0 + vpblendd $3,%ymm0,%ymm13,%ymm13 + + + + vpsrlq $52,%ymm13,%ymm0 + vpsrlq $52,%ymm14,%ymm1 + vpsrlq $52,%ymm15,%ymm2 + vpsrlq $52,%ymm16,%ymm23 + vpsrlq $52,%ymm17,%ymm24 + vpsrlq $52,%ymm18,%ymm25 + vpsrlq $52,%ymm19,%ymm26 + vpsrlq $52,%ymm20,%ymm27 + vpsrlq $52,%ymm21,%ymm28 + vpsrlq $52,%ymm22,%ymm29 + + + valignq $3,%ymm28,%ymm29,%ymm29 + valignq $3,%ymm27,%ymm28,%ymm28 + valignq $3,%ymm26,%ymm27,%ymm27 + valignq $3,%ymm25,%ymm26,%ymm26 + valignq $3,%ymm24,%ymm25,%ymm25 + valignq $3,%ymm23,%ymm24,%ymm24 + valignq $3,%ymm2,%ymm23,%ymm23 + valignq $3,%ymm1,%ymm2,%ymm2 + valignq $3,%ymm0,%ymm1,%ymm1 + valignq $3,.Lzeros(%rip),%ymm0,%ymm0 + + + vpandq .Lmask52x4(%rip),%ymm13,%ymm13 + vpandq .Lmask52x4(%rip),%ymm14,%ymm14 + vpandq .Lmask52x4(%rip),%ymm15,%ymm15 + vpandq .Lmask52x4(%rip),%ymm16,%ymm16 + vpandq .Lmask52x4(%rip),%ymm17,%ymm17 + vpandq .Lmask52x4(%rip),%ymm18,%ymm18 + vpandq .Lmask52x4(%rip),%ymm19,%ymm19 + vpandq .Lmask52x4(%rip),%ymm20,%ymm20 + vpandq .Lmask52x4(%rip),%ymm21,%ymm21 + vpandq .Lmask52x4(%rip),%ymm22,%ymm22 + + + vpaddq %ymm0,%ymm13,%ymm13 + vpaddq %ymm1,%ymm14,%ymm14 + vpaddq %ymm2,%ymm15,%ymm15 + vpaddq %ymm23,%ymm16,%ymm16 + vpaddq %ymm24,%ymm17,%ymm17 + vpaddq %ymm25,%ymm18,%ymm18 + vpaddq %ymm26,%ymm19,%ymm19 + vpaddq %ymm27,%ymm20,%ymm20 + vpaddq %ymm28,%ymm21,%ymm21 + vpaddq %ymm29,%ymm22,%ymm22 + + + + vpcmpuq $6,.Lmask52x4(%rip),%ymm13,%k1 + vpcmpuq $6,.Lmask52x4(%rip),%ymm14,%k2 + kmovb %k1,%r14d + kmovb %k2,%r13d + shlb $4,%r13b + orb %r13b,%r14b + + vpcmpuq $6,.Lmask52x4(%rip),%ymm15,%k1 + vpcmpuq $6,.Lmask52x4(%rip),%ymm16,%k2 + kmovb %k1,%r13d + kmovb %k2,%r12d + shlb $4,%r12b + orb %r12b,%r13b + + vpcmpuq $6,.Lmask52x4(%rip),%ymm17,%k1 + vpcmpuq $6,.Lmask52x4(%rip),%ymm18,%k2 + kmovb %k1,%r12d + kmovb %k2,%r11d + shlb $4,%r11b + orb %r11b,%r12b + + vpcmpuq $6,.Lmask52x4(%rip),%ymm19,%k1 + vpcmpuq $6,.Lmask52x4(%rip),%ymm20,%k2 + kmovb %k1,%r11d + kmovb %k2,%r10d + shlb $4,%r10b + orb %r10b,%r11b + + vpcmpuq $6,.Lmask52x4(%rip),%ymm21,%k1 + vpcmpuq $6,.Lmask52x4(%rip),%ymm22,%k2 + kmovb %k1,%r10d + kmovb %k2,%r9d + shlb $4,%r9b + orb %r9b,%r10b + + addb %r14b,%r14b + adcb %r13b,%r13b + adcb %r12b,%r12b + adcb %r11b,%r11b + adcb %r10b,%r10b + + + vpcmpuq $0,.Lmask52x4(%rip),%ymm13,%k1 + vpcmpuq $0,.Lmask52x4(%rip),%ymm14,%k2 + kmovb %k1,%r9d + kmovb %k2,%r8d + shlb $4,%r8b + orb %r8b,%r9b + + vpcmpuq $0,.Lmask52x4(%rip),%ymm15,%k1 + vpcmpuq $0,.Lmask52x4(%rip),%ymm16,%k2 + kmovb %k1,%r8d + kmovb %k2,%edx + shlb $4,%dl + orb %dl,%r8b + + vpcmpuq $0,.Lmask52x4(%rip),%ymm17,%k1 + vpcmpuq $0,.Lmask52x4(%rip),%ymm18,%k2 + kmovb %k1,%edx + kmovb %k2,%ecx + shlb $4,%cl + orb %cl,%dl + + vpcmpuq $0,.Lmask52x4(%rip),%ymm19,%k1 + vpcmpuq $0,.Lmask52x4(%rip),%ymm20,%k2 + kmovb %k1,%ecx + kmovb %k2,%ebx + shlb $4,%bl + orb %bl,%cl + + vpcmpuq $0,.Lmask52x4(%rip),%ymm21,%k1 + vpcmpuq $0,.Lmask52x4(%rip),%ymm22,%k2 + kmovb %k1,%ebx + kmovb %k2,%eax + shlb $4,%al + orb %al,%bl + + addb %r9b,%r14b + adcb %r8b,%r13b + adcb %dl,%r12b + adcb %cl,%r11b + adcb %bl,%r10b + + xorb %r9b,%r14b + xorb %r8b,%r13b + xorb %dl,%r12b + xorb %cl,%r11b + xorb %bl,%r10b + + kmovb %r14d,%k1 + shrb $4,%r14b + kmovb %r14d,%k2 + kmovb %r13d,%k3 + shrb $4,%r13b + kmovb %r13d,%k4 + kmovb %r12d,%k5 + shrb $4,%r12b + kmovb %r12d,%k6 + kmovb %r11d,%k7 + + vpsubq .Lmask52x4(%rip),%ymm13,%ymm13{%k1} + vpsubq .Lmask52x4(%rip),%ymm14,%ymm14{%k2} + vpsubq 
.Lmask52x4(%rip),%ymm15,%ymm15{%k3} + vpsubq .Lmask52x4(%rip),%ymm16,%ymm16{%k4} + vpsubq .Lmask52x4(%rip),%ymm17,%ymm17{%k5} + vpsubq .Lmask52x4(%rip),%ymm18,%ymm18{%k6} + vpsubq .Lmask52x4(%rip),%ymm19,%ymm19{%k7} + + vpandq .Lmask52x4(%rip),%ymm13,%ymm13 + vpandq .Lmask52x4(%rip),%ymm14,%ymm14 + vpandq .Lmask52x4(%rip),%ymm15,%ymm15 + vpandq .Lmask52x4(%rip),%ymm16,%ymm16 + vpandq .Lmask52x4(%rip),%ymm17,%ymm17 + vpandq .Lmask52x4(%rip),%ymm18,%ymm18 + vpandq .Lmask52x4(%rip),%ymm19,%ymm19 + + shrb $4,%r11b + kmovb %r11d,%k1 + kmovb %r10d,%k2 + shrb $4,%r10b + kmovb %r10d,%k3 + + vpsubq .Lmask52x4(%rip),%ymm20,%ymm20{%k1} + vpsubq .Lmask52x4(%rip),%ymm21,%ymm21{%k2} + vpsubq .Lmask52x4(%rip),%ymm22,%ymm22{%k3} + + vpandq .Lmask52x4(%rip),%ymm20,%ymm20 + vpandq .Lmask52x4(%rip),%ymm21,%ymm21 + vpandq .Lmask52x4(%rip),%ymm22,%ymm22 + + vmovdqu64 %ymm3,0(%rdi) + vmovdqu64 %ymm4,32(%rdi) + vmovdqu64 %ymm5,64(%rdi) + vmovdqu64 %ymm6,96(%rdi) + vmovdqu64 %ymm7,128(%rdi) + vmovdqu64 %ymm8,160(%rdi) + vmovdqu64 %ymm9,192(%rdi) + vmovdqu64 %ymm10,224(%rdi) + vmovdqu64 %ymm11,256(%rdi) + vmovdqu64 %ymm12,288(%rdi) + + vmovdqu64 %ymm13,320(%rdi) + vmovdqu64 %ymm14,352(%rdi) + vmovdqu64 %ymm15,384(%rdi) + vmovdqu64 %ymm16,416(%rdi) + vmovdqu64 %ymm17,448(%rdi) + vmovdqu64 %ymm18,480(%rdi) + vmovdqu64 %ymm19,512(%rdi) + vmovdqu64 %ymm20,544(%rdi) + vmovdqu64 %ymm21,576(%rdi) + vmovdqu64 %ymm22,608(%rdi) + + vzeroupper + leaq (%rsp),%rax +.cfi_def_cfa_register %rax + movq 0(%rax),%r15 +.cfi_restore %r15 + movq 8(%rax),%r14 +.cfi_restore %r14 + movq 16(%rax),%r13 +.cfi_restore %r13 + movq 24(%rax),%r12 +.cfi_restore %r12 + movq 32(%rax),%rbp +.cfi_restore %rbp + movq 40(%rax),%rbx +.cfi_restore %rbx + leaq 48(%rax),%rsp +.cfi_def_cfa %rsp,8 +.Lossl_rsaz_amm52x40_x2_ifma256_epilogue: + .byte 0xf3,0xc3 +.cfi_endproc +.size ossl_rsaz_amm52x40_x2_ifma256, .-ossl_rsaz_amm52x40_x2_ifma256 +.text + +.align 32 +.globl ossl_extract_multiplier_2x40_win5 +.type ossl_extract_multiplier_2x40_win5,@function +ossl_extract_multiplier_2x40_win5: +.cfi_startproc +.byte 243,15,30,250 + vmovdqa64 .Lones(%rip),%ymm24 + vpbroadcastq %rdx,%ymm22 + vpbroadcastq %rcx,%ymm23 + leaq 20480(%rsi),%rax + + + movq %rsi,%r10 + + + vpxor %xmm0,%xmm0,%xmm0 + vmovdqa64 %ymm0,%ymm1 + vmovdqa64 %ymm0,%ymm2 + vmovdqa64 %ymm0,%ymm3 + vmovdqa64 %ymm0,%ymm4 + vmovdqa64 %ymm0,%ymm5 + vmovdqa64 %ymm0,%ymm16 + vmovdqa64 %ymm0,%ymm17 + vmovdqa64 %ymm0,%ymm18 + vmovdqa64 %ymm0,%ymm19 + vpxorq %ymm21,%ymm21,%ymm21 +.align 32 +.Lloop_0: + vpcmpq $0,%ymm21,%ymm22,%k1 + vmovdqu64 0(%rsi),%ymm20 + vpblendmq %ymm20,%ymm0,%ymm0{%k1} + vmovdqu64 32(%rsi),%ymm20 + vpblendmq %ymm20,%ymm1,%ymm1{%k1} + vmovdqu64 64(%rsi),%ymm20 + vpblendmq %ymm20,%ymm2,%ymm2{%k1} + vmovdqu64 96(%rsi),%ymm20 + vpblendmq %ymm20,%ymm3,%ymm3{%k1} + vmovdqu64 128(%rsi),%ymm20 + vpblendmq %ymm20,%ymm4,%ymm4{%k1} + vmovdqu64 160(%rsi),%ymm20 + vpblendmq %ymm20,%ymm5,%ymm5{%k1} + vmovdqu64 192(%rsi),%ymm20 + vpblendmq %ymm20,%ymm16,%ymm16{%k1} + vmovdqu64 224(%rsi),%ymm20 + vpblendmq %ymm20,%ymm17,%ymm17{%k1} + vmovdqu64 256(%rsi),%ymm20 + vpblendmq %ymm20,%ymm18,%ymm18{%k1} + vmovdqu64 288(%rsi),%ymm20 + vpblendmq %ymm20,%ymm19,%ymm19{%k1} + vpaddq %ymm24,%ymm21,%ymm21 + addq $640,%rsi + cmpq %rsi,%rax + jne .Lloop_0 + vmovdqu64 %ymm0,0(%rdi) + vmovdqu64 %ymm1,32(%rdi) + vmovdqu64 %ymm2,64(%rdi) + vmovdqu64 %ymm3,96(%rdi) + vmovdqu64 %ymm4,128(%rdi) + vmovdqu64 %ymm5,160(%rdi) + vmovdqu64 %ymm16,192(%rdi) + vmovdqu64 %ymm17,224(%rdi) + vmovdqu64 %ymm18,256(%rdi) + vmovdqu64 
%ymm19,288(%rdi) + movq %r10,%rsi + vpxorq %ymm21,%ymm21,%ymm21 +.align 32 +.Lloop_320: + vpcmpq $0,%ymm21,%ymm23,%k1 + vmovdqu64 320(%rsi),%ymm20 + vpblendmq %ymm20,%ymm0,%ymm0{%k1} + vmovdqu64 352(%rsi),%ymm20 + vpblendmq %ymm20,%ymm1,%ymm1{%k1} + vmovdqu64 384(%rsi),%ymm20 + vpblendmq %ymm20,%ymm2,%ymm2{%k1} + vmovdqu64 416(%rsi),%ymm20 + vpblendmq %ymm20,%ymm3,%ymm3{%k1} + vmovdqu64 448(%rsi),%ymm20 + vpblendmq %ymm20,%ymm4,%ymm4{%k1} + vmovdqu64 480(%rsi),%ymm20 + vpblendmq %ymm20,%ymm5,%ymm5{%k1} + vmovdqu64 512(%rsi),%ymm20 + vpblendmq %ymm20,%ymm16,%ymm16{%k1} + vmovdqu64 544(%rsi),%ymm20 + vpblendmq %ymm20,%ymm17,%ymm17{%k1} + vmovdqu64 576(%rsi),%ymm20 + vpblendmq %ymm20,%ymm18,%ymm18{%k1} + vmovdqu64 608(%rsi),%ymm20 + vpblendmq %ymm20,%ymm19,%ymm19{%k1} + vpaddq %ymm24,%ymm21,%ymm21 + addq $640,%rsi + cmpq %rsi,%rax + jne .Lloop_320 + vmovdqu64 %ymm0,320(%rdi) + vmovdqu64 %ymm1,352(%rdi) + vmovdqu64 %ymm2,384(%rdi) + vmovdqu64 %ymm3,416(%rdi) + vmovdqu64 %ymm4,448(%rdi) + vmovdqu64 %ymm5,480(%rdi) + vmovdqu64 %ymm16,512(%rdi) + vmovdqu64 %ymm17,544(%rdi) + vmovdqu64 %ymm18,576(%rdi) + vmovdqu64 %ymm19,608(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size ossl_extract_multiplier_2x40_win5, .-ossl_extract_multiplier_2x40_win5 +.section .rodata +.align 32 +.Lones: +.quad 1,1,1,1 +.Lzeros: +.quad 0,0,0,0 + .section ".note.gnu.property", "a" + .p2align 3 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + # "GNU" encoded with .byte, since .asciz isn't supported + # on Solaris. + .byte 0x47 + .byte 0x4e + .byte 0x55 + .byte 0 +1: + .p2align 3 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 3 +4: diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/bn/rsaz-avx2.s b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/bn/rsaz-avx2.s index 7644d07da74e..d7093b8f0395 100644 --- a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/bn/rsaz-avx2.s +++ b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/bn/rsaz-avx2.s @@ -1730,6 +1730,7 @@ rsaz_avx2_eligible: .byte 0xf3,0xc3 .size rsaz_avx2_eligible,.-rsaz_avx2_eligible +.section .rodata .align 64 .Land_mask: .quad 0x1fffffff,0x1fffffff,0x1fffffff,0x1fffffff @@ -1741,6 +1742,7 @@ rsaz_avx2_eligible: .long 0,0,0,0, 1,1,1,1 .long 2,2,2,2, 3,3,3,3 .long 4,4,4,4, 4,4,4,4 +.previous .align 64 .section ".note.gnu.property", "a" .p2align 3 diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/bn/rsaz-x86_64.s b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/bn/rsaz-x86_64.s index ea1ae3891193..2f1270f6e6b5 100644 --- a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/bn/rsaz-x86_64.s +++ b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/bn/rsaz-x86_64.s @@ -2009,10 +2009,12 @@ rsaz_512_gather4: .cfi_endproc .size rsaz_512_gather4,.-rsaz_512_gather4 +.section .rodata .align 64 .Linc: .long 0,0, 1,1 .long 2,2, 2,2 +.previous .section ".note.gnu.property", "a" .p2align 3 .long 1f - 0f diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/bn/x86_64-mont5.s b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/bn/x86_64-mont5.s index d0025f94e2d7..1a4502db89c5 100644 --- a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/bn/x86_64-mont5.s +++ b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/bn/x86_64-mont5.s @@ -3596,11 +3596,13 @@ bn_gather5: .LSEH_end_bn_gather5: .cfi_endproc .size bn_gather5,.-bn_gather5 +.section .rodata .align 64 .Linc: .long 0,0, 1,1 .long 2,2, 2,2 .byte 
77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115,99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.previous .section ".note.gnu.property", "a" .p2align 3 .long 1f - 0f diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/ec/ecp_nistz256-x86_64.s b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/ec/ecp_nistz256-x86_64.s index 4e05eefb1ee9..91b1e0c7ce64 100644 --- a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/ec/ecp_nistz256-x86_64.s +++ b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/ec/ecp_nistz256-x86_64.s @@ -1,4 +1,4 @@ -.text +.section .rodata .globl ecp_nistz256_precomputed .type ecp_nistz256_precomputed,@object .align 4096 @@ -2376,6 +2376,7 @@ ecp_nistz256_precomputed: +.section .rodata .align 64 .Lpoly: .quad 0xffffffffffffffff, 0x00000000ffffffff, 0x0000000000000000, 0xffffffff00000001 @@ -2398,6 +2399,7 @@ ecp_nistz256_precomputed: .quad 0xf3b9cac2fc632551, 0xbce6faada7179e84, 0xffffffffffffffff, 0xffffffff00000000 .LordK: .quad 0xccd1c8aaee00bc4f +.previous .globl ecp_nistz256_mul_by_2 .type ecp_nistz256_mul_by_2,@function diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/modes/aes-gcm-avx512.s b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/modes/aes-gcm-avx512.s new file mode 100644 index 000000000000..0cd13c77c4af --- /dev/null +++ b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/modes/aes-gcm-avx512.s @@ -0,0 +1,136131 @@ + +.globl ossl_vaes_vpclmulqdq_capable +.type ossl_vaes_vpclmulqdq_capable,@function +.align 32 +ossl_vaes_vpclmulqdq_capable: + movq OPENSSL_ia32cap_P+8(%rip),%rcx + + movq $6600291188736,%rdx + xorl %eax,%eax + andq %rdx,%rcx + cmpq %rdx,%rcx + cmoveq %rcx,%rax + .byte 0xf3,0xc3 +.size ossl_vaes_vpclmulqdq_capable, .-ossl_vaes_vpclmulqdq_capable +.text +.globl ossl_aes_gcm_init_avx512 +.type ossl_aes_gcm_init_avx512,@function +.align 32 +ossl_aes_gcm_init_avx512: +.cfi_startproc +.byte 243,15,30,250 + vpxorq %xmm16,%xmm16,%xmm16 + + + movl 240(%rdi),%eax + cmpl $9,%eax + je .Laes_128_0 + cmpl $11,%eax + je .Laes_192_0 + cmpl $13,%eax + je .Laes_256_0 + jmp .Lexit_aes_0 +.align 32 +.Laes_128_0: + vpxorq 0(%rdi),%xmm16,%xmm16 + + vaesenc 16(%rdi),%xmm16,%xmm16 + + vaesenc 32(%rdi),%xmm16,%xmm16 + + vaesenc 48(%rdi),%xmm16,%xmm16 + + vaesenc 64(%rdi),%xmm16,%xmm16 + + vaesenc 80(%rdi),%xmm16,%xmm16 + + vaesenc 96(%rdi),%xmm16,%xmm16 + + vaesenc 112(%rdi),%xmm16,%xmm16 + + vaesenc 128(%rdi),%xmm16,%xmm16 + + vaesenc 144(%rdi),%xmm16,%xmm16 + + vaesenclast 160(%rdi),%xmm16,%xmm16 + jmp .Lexit_aes_0 +.align 32 +.Laes_192_0: + vpxorq 0(%rdi),%xmm16,%xmm16 + + vaesenc 16(%rdi),%xmm16,%xmm16 + + vaesenc 32(%rdi),%xmm16,%xmm16 + + vaesenc 48(%rdi),%xmm16,%xmm16 + + vaesenc 64(%rdi),%xmm16,%xmm16 + + vaesenc 80(%rdi),%xmm16,%xmm16 + + vaesenc 96(%rdi),%xmm16,%xmm16 + + vaesenc 112(%rdi),%xmm16,%xmm16 + + vaesenc 128(%rdi),%xmm16,%xmm16 + + vaesenc 144(%rdi),%xmm16,%xmm16 + + vaesenc 160(%rdi),%xmm16,%xmm16 + + vaesenc 176(%rdi),%xmm16,%xmm16 + + vaesenclast 192(%rdi),%xmm16,%xmm16 + jmp .Lexit_aes_0 +.align 32 +.Laes_256_0: + vpxorq 0(%rdi),%xmm16,%xmm16 + + vaesenc 16(%rdi),%xmm16,%xmm16 + + vaesenc 32(%rdi),%xmm16,%xmm16 + + vaesenc 48(%rdi),%xmm16,%xmm16 + + vaesenc 64(%rdi),%xmm16,%xmm16 + + vaesenc 80(%rdi),%xmm16,%xmm16 + + vaesenc 96(%rdi),%xmm16,%xmm16 + + 
vaesenc 112(%rdi),%xmm16,%xmm16 + + vaesenc 128(%rdi),%xmm16,%xmm16 + + vaesenc 144(%rdi),%xmm16,%xmm16 + + vaesenc 160(%rdi),%xmm16,%xmm16 + + vaesenc 176(%rdi),%xmm16,%xmm16 + + vaesenc 192(%rdi),%xmm16,%xmm16 + + vaesenc 208(%rdi),%xmm16,%xmm16 + + vaesenclast 224(%rdi),%xmm16,%xmm16 + jmp .Lexit_aes_0 +.Lexit_aes_0: + + vpshufb SHUF_MASK(%rip),%xmm16,%xmm16 + + vmovdqa64 %xmm16,%xmm2 + vpsllq $1,%xmm16,%xmm16 + vpsrlq $63,%xmm2,%xmm2 + vmovdqa %xmm2,%xmm1 + vpslldq $8,%xmm2,%xmm2 + vpsrldq $8,%xmm1,%xmm1 + vporq %xmm2,%xmm16,%xmm16 + + vpshufd $36,%xmm1,%xmm2 + vpcmpeqd TWOONE(%rip),%xmm2,%xmm2 + vpand POLY(%rip),%xmm2,%xmm2 + vpxorq %xmm2,%xmm16,%xmm16 + + vmovdqu64 %xmm16,336(%rsi) + vshufi32x4 $0x00,%ymm16,%ymm16,%ymm4 + vmovdqa %ymm4,%ymm3 + + vpclmulqdq $0x11,%ymm4,%ymm3,%ymm0 + vpclmulqdq $0x00,%ymm4,%ymm3,%ymm1 + vpclmulqdq $0x01,%ymm4,%ymm3,%ymm2 + vpclmulqdq $0x10,%ymm4,%ymm3,%ymm3 + vpxorq %ymm2,%ymm3,%ymm3 + + vpsrldq $8,%ymm3,%ymm2 + vpslldq $8,%ymm3,%ymm3 + vpxorq %ymm2,%ymm0,%ymm0 + vpxorq %ymm1,%ymm3,%ymm3 + + + + vmovdqu64 POLY2(%rip),%ymm2 + + vpclmulqdq $0x01,%ymm3,%ymm2,%ymm1 + vpslldq $8,%ymm1,%ymm1 + vpxorq %ymm1,%ymm3,%ymm3 + + + + vpclmulqdq $0x00,%ymm3,%ymm2,%ymm1 + vpsrldq $4,%ymm1,%ymm1 + vpclmulqdq $0x10,%ymm3,%ymm2,%ymm3 + vpslldq $4,%ymm3,%ymm3 + + vpternlogq $0x96,%ymm1,%ymm0,%ymm3 + + vmovdqu64 %xmm3,320(%rsi) + vinserti64x2 $1,%xmm16,%ymm3,%ymm4 + vmovdqa64 %ymm4,%ymm5 + + vpclmulqdq $0x11,%ymm3,%ymm4,%ymm0 + vpclmulqdq $0x00,%ymm3,%ymm4,%ymm1 + vpclmulqdq $0x01,%ymm3,%ymm4,%ymm2 + vpclmulqdq $0x10,%ymm3,%ymm4,%ymm4 + vpxorq %ymm2,%ymm4,%ymm4 + + vpsrldq $8,%ymm4,%ymm2 + vpslldq $8,%ymm4,%ymm4 + vpxorq %ymm2,%ymm0,%ymm0 + vpxorq %ymm1,%ymm4,%ymm4 + + + + vmovdqu64 POLY2(%rip),%ymm2 + + vpclmulqdq $0x01,%ymm4,%ymm2,%ymm1 + vpslldq $8,%ymm1,%ymm1 + vpxorq %ymm1,%ymm4,%ymm4 + + + + vpclmulqdq $0x00,%ymm4,%ymm2,%ymm1 + vpsrldq $4,%ymm1,%ymm1 + vpclmulqdq $0x10,%ymm4,%ymm2,%ymm4 + vpslldq $4,%ymm4,%ymm4 + + vpternlogq $0x96,%ymm1,%ymm0,%ymm4 + + vmovdqu64 %ymm4,288(%rsi) + + vinserti64x4 $1,%ymm5,%zmm4,%zmm4 + + + vshufi64x2 $0x00,%zmm4,%zmm4,%zmm3 + vmovdqa64 %zmm4,%zmm5 + + vpclmulqdq $0x11,%zmm3,%zmm4,%zmm0 + vpclmulqdq $0x00,%zmm3,%zmm4,%zmm1 + vpclmulqdq $0x01,%zmm3,%zmm4,%zmm2 + vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4 + vpxorq %zmm2,%zmm4,%zmm4 + + vpsrldq $8,%zmm4,%zmm2 + vpslldq $8,%zmm4,%zmm4 + vpxorq %zmm2,%zmm0,%zmm0 + vpxorq %zmm1,%zmm4,%zmm4 + + + + vmovdqu64 POLY2(%rip),%zmm2 + + vpclmulqdq $0x01,%zmm4,%zmm2,%zmm1 + vpslldq $8,%zmm1,%zmm1 + vpxorq %zmm1,%zmm4,%zmm4 + + + + vpclmulqdq $0x00,%zmm4,%zmm2,%zmm1 + vpsrldq $4,%zmm1,%zmm1 + vpclmulqdq $0x10,%zmm4,%zmm2,%zmm4 + vpslldq $4,%zmm4,%zmm4 + + vpternlogq $0x96,%zmm1,%zmm0,%zmm4 + + vmovdqu64 %zmm4,224(%rsi) + vshufi64x2 $0x00,%zmm4,%zmm4,%zmm3 + + vpclmulqdq $0x11,%zmm3,%zmm5,%zmm0 + vpclmulqdq $0x00,%zmm3,%zmm5,%zmm1 + vpclmulqdq $0x01,%zmm3,%zmm5,%zmm2 + vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5 + vpxorq %zmm2,%zmm5,%zmm5 + + vpsrldq $8,%zmm5,%zmm2 + vpslldq $8,%zmm5,%zmm5 + vpxorq %zmm2,%zmm0,%zmm0 + vpxorq %zmm1,%zmm5,%zmm5 + + + + vmovdqu64 POLY2(%rip),%zmm2 + + vpclmulqdq $0x01,%zmm5,%zmm2,%zmm1 + vpslldq $8,%zmm1,%zmm1 + vpxorq %zmm1,%zmm5,%zmm5 + + + + vpclmulqdq $0x00,%zmm5,%zmm2,%zmm1 + vpsrldq $4,%zmm1,%zmm1 + vpclmulqdq $0x10,%zmm5,%zmm2,%zmm5 + vpslldq $4,%zmm5,%zmm5 + + vpternlogq $0x96,%zmm1,%zmm0,%zmm5 + + vmovdqu64 %zmm5,160(%rsi) + + vpclmulqdq $0x11,%zmm3,%zmm4,%zmm0 + vpclmulqdq $0x00,%zmm3,%zmm4,%zmm1 + vpclmulqdq $0x01,%zmm3,%zmm4,%zmm2 + vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4 + vpxorq 
%zmm2,%zmm4,%zmm4 + + vpsrldq $8,%zmm4,%zmm2 + vpslldq $8,%zmm4,%zmm4 + vpxorq %zmm2,%zmm0,%zmm0 + vpxorq %zmm1,%zmm4,%zmm4 + + + + vmovdqu64 POLY2(%rip),%zmm2 + + vpclmulqdq $0x01,%zmm4,%zmm2,%zmm1 + vpslldq $8,%zmm1,%zmm1 + vpxorq %zmm1,%zmm4,%zmm4 + + + + vpclmulqdq $0x00,%zmm4,%zmm2,%zmm1 + vpsrldq $4,%zmm1,%zmm1 + vpclmulqdq $0x10,%zmm4,%zmm2,%zmm4 + vpslldq $4,%zmm4,%zmm4 + + vpternlogq $0x96,%zmm1,%zmm0,%zmm4 + + vmovdqu64 %zmm4,96(%rsi) + vzeroupper +.Labort_init: + .byte 0xf3,0xc3 +.cfi_endproc +.size ossl_aes_gcm_init_avx512, .-ossl_aes_gcm_init_avx512 +.globl ossl_aes_gcm_setiv_avx512 +.type ossl_aes_gcm_setiv_avx512,@function +.align 32 +ossl_aes_gcm_setiv_avx512: +.cfi_startproc +.Lsetiv_seh_begin: +.byte 243,15,30,250 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-16 +.Lsetiv_seh_push_rbx: + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-24 +.Lsetiv_seh_push_rbp: + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 +.Lsetiv_seh_push_r12: + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 +.Lsetiv_seh_push_r13: + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 +.Lsetiv_seh_push_r14: + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 +.Lsetiv_seh_push_r15: + + + + + + + + + + + leaq 0(%rsp),%rbp +.cfi_def_cfa_register %rbp +.Lsetiv_seh_setfp: + +.Lsetiv_seh_prolog_end: + subq $820,%rsp + andq $(-64),%rsp + cmpq $12,%rcx + je iv_len_12_init_IV + vpxor %xmm2,%xmm2,%xmm2 + movq %rdx,%r10 + movq %rcx,%r11 + orq %r11,%r11 + jz .L_CALC_AAD_done_1 + + xorq %rbx,%rbx + vmovdqa64 SHUF_MASK(%rip),%zmm16 + +.L_get_AAD_loop48x16_1: + cmpq $768,%r11 + jl .L_exit_AAD_loop48x16_1 + vmovdqu64 0(%r10),%zmm11 + vmovdqu64 64(%r10),%zmm3 + vmovdqu64 128(%r10),%zmm4 + vmovdqu64 192(%r10),%zmm5 + vpshufb %zmm16,%zmm11,%zmm11 + vpshufb %zmm16,%zmm3,%zmm3 + vpshufb %zmm16,%zmm4,%zmm4 + vpshufb %zmm16,%zmm5,%zmm5 + testq %rbx,%rbx + jnz .L_skip_hkeys_precomputation_2 + + vmovdqu64 288(%rsi),%zmm1 + vmovdqu64 %zmm1,704(%rsp) + + vmovdqu64 224(%rsi),%zmm9 + vmovdqu64 %zmm9,640(%rsp) + + + vshufi64x2 $0x00,%zmm9,%zmm9,%zmm9 + + vmovdqu64 160(%rsi),%zmm10 + vmovdqu64 %zmm10,576(%rsp) + + vmovdqu64 96(%rsi),%zmm12 + vmovdqu64 %zmm12,512(%rsp) + + vpclmulqdq $0x11,%zmm9,%zmm10,%zmm13 + vpclmulqdq $0x00,%zmm9,%zmm10,%zmm15 + vpclmulqdq $0x01,%zmm9,%zmm10,%zmm17 + vpclmulqdq $0x10,%zmm9,%zmm10,%zmm10 + vpxorq %zmm17,%zmm10,%zmm10 + + vpsrldq $8,%zmm10,%zmm17 + vpslldq $8,%zmm10,%zmm10 + vpxorq %zmm17,%zmm13,%zmm13 + vpxorq %zmm15,%zmm10,%zmm10 + + + + vmovdqu64 POLY2(%rip),%zmm17 + + vpclmulqdq $0x01,%zmm10,%zmm17,%zmm15 + vpslldq $8,%zmm15,%zmm15 + vpxorq %zmm15,%zmm10,%zmm10 + + + + vpclmulqdq $0x00,%zmm10,%zmm17,%zmm15 + vpsrldq $4,%zmm15,%zmm15 + vpclmulqdq $0x10,%zmm10,%zmm17,%zmm10 + vpslldq $4,%zmm10,%zmm10 + + vpternlogq $0x96,%zmm15,%zmm13,%zmm10 + + vmovdqu64 %zmm10,448(%rsp) + + vpclmulqdq $0x11,%zmm9,%zmm12,%zmm13 + vpclmulqdq $0x00,%zmm9,%zmm12,%zmm15 + vpclmulqdq $0x01,%zmm9,%zmm12,%zmm17 + vpclmulqdq $0x10,%zmm9,%zmm12,%zmm12 + vpxorq %zmm17,%zmm12,%zmm12 + + vpsrldq $8,%zmm12,%zmm17 + vpslldq $8,%zmm12,%zmm12 + vpxorq %zmm17,%zmm13,%zmm13 + vpxorq %zmm15,%zmm12,%zmm12 + + + + vmovdqu64 POLY2(%rip),%zmm17 + + vpclmulqdq $0x01,%zmm12,%zmm17,%zmm15 + vpslldq $8,%zmm15,%zmm15 + vpxorq %zmm15,%zmm12,%zmm12 + + + + vpclmulqdq $0x00,%zmm12,%zmm17,%zmm15 + vpsrldq $4,%zmm15,%zmm15 + vpclmulqdq $0x10,%zmm12,%zmm17,%zmm12 + vpslldq $4,%zmm12,%zmm12 + + vpternlogq $0x96,%zmm15,%zmm13,%zmm12 + + vmovdqu64 %zmm12,384(%rsp) + + vpclmulqdq 
$0x11,%zmm9,%zmm10,%zmm13 + vpclmulqdq $0x00,%zmm9,%zmm10,%zmm15 + vpclmulqdq $0x01,%zmm9,%zmm10,%zmm17 + vpclmulqdq $0x10,%zmm9,%zmm10,%zmm10 + vpxorq %zmm17,%zmm10,%zmm10 + + vpsrldq $8,%zmm10,%zmm17 + vpslldq $8,%zmm10,%zmm10 + vpxorq %zmm17,%zmm13,%zmm13 + vpxorq %zmm15,%zmm10,%zmm10 + + + + vmovdqu64 POLY2(%rip),%zmm17 + + vpclmulqdq $0x01,%zmm10,%zmm17,%zmm15 + vpslldq $8,%zmm15,%zmm15 + vpxorq %zmm15,%zmm10,%zmm10 + + + + vpclmulqdq $0x00,%zmm10,%zmm17,%zmm15 + vpsrldq $4,%zmm15,%zmm15 + vpclmulqdq $0x10,%zmm10,%zmm17,%zmm10 + vpslldq $4,%zmm10,%zmm10 + + vpternlogq $0x96,%zmm15,%zmm13,%zmm10 + + vmovdqu64 %zmm10,320(%rsp) + + vpclmulqdq $0x11,%zmm9,%zmm12,%zmm13 + vpclmulqdq $0x00,%zmm9,%zmm12,%zmm15 + vpclmulqdq $0x01,%zmm9,%zmm12,%zmm17 + vpclmulqdq $0x10,%zmm9,%zmm12,%zmm12 + vpxorq %zmm17,%zmm12,%zmm12 + + vpsrldq $8,%zmm12,%zmm17 + vpslldq $8,%zmm12,%zmm12 + vpxorq %zmm17,%zmm13,%zmm13 + vpxorq %zmm15,%zmm12,%zmm12 + + + + vmovdqu64 POLY2(%rip),%zmm17 + + vpclmulqdq $0x01,%zmm12,%zmm17,%zmm15 + vpslldq $8,%zmm15,%zmm15 + vpxorq %zmm15,%zmm12,%zmm12 + + + + vpclmulqdq $0x00,%zmm12,%zmm17,%zmm15 + vpsrldq $4,%zmm15,%zmm15 + vpclmulqdq $0x10,%zmm12,%zmm17,%zmm12 + vpslldq $4,%zmm12,%zmm12 + + vpternlogq $0x96,%zmm15,%zmm13,%zmm12 + + vmovdqu64 %zmm12,256(%rsp) + + vpclmulqdq $0x11,%zmm9,%zmm10,%zmm13 + vpclmulqdq $0x00,%zmm9,%zmm10,%zmm15 + vpclmulqdq $0x01,%zmm9,%zmm10,%zmm17 + vpclmulqdq $0x10,%zmm9,%zmm10,%zmm10 + vpxorq %zmm17,%zmm10,%zmm10 + + vpsrldq $8,%zmm10,%zmm17 + vpslldq $8,%zmm10,%zmm10 + vpxorq %zmm17,%zmm13,%zmm13 + vpxorq %zmm15,%zmm10,%zmm10 + + + + vmovdqu64 POLY2(%rip),%zmm17 + + vpclmulqdq $0x01,%zmm10,%zmm17,%zmm15 + vpslldq $8,%zmm15,%zmm15 + vpxorq %zmm15,%zmm10,%zmm10 + + + + vpclmulqdq $0x00,%zmm10,%zmm17,%zmm15 + vpsrldq $4,%zmm15,%zmm15 + vpclmulqdq $0x10,%zmm10,%zmm17,%zmm10 + vpslldq $4,%zmm10,%zmm10 + + vpternlogq $0x96,%zmm15,%zmm13,%zmm10 + + vmovdqu64 %zmm10,192(%rsp) + + vpclmulqdq $0x11,%zmm9,%zmm12,%zmm13 + vpclmulqdq $0x00,%zmm9,%zmm12,%zmm15 + vpclmulqdq $0x01,%zmm9,%zmm12,%zmm17 + vpclmulqdq $0x10,%zmm9,%zmm12,%zmm12 + vpxorq %zmm17,%zmm12,%zmm12 + + vpsrldq $8,%zmm12,%zmm17 + vpslldq $8,%zmm12,%zmm12 + vpxorq %zmm17,%zmm13,%zmm13 + vpxorq %zmm15,%zmm12,%zmm12 + + + + vmovdqu64 POLY2(%rip),%zmm17 + + vpclmulqdq $0x01,%zmm12,%zmm17,%zmm15 + vpslldq $8,%zmm15,%zmm15 + vpxorq %zmm15,%zmm12,%zmm12 + + + + vpclmulqdq $0x00,%zmm12,%zmm17,%zmm15 + vpsrldq $4,%zmm15,%zmm15 + vpclmulqdq $0x10,%zmm12,%zmm17,%zmm12 + vpslldq $4,%zmm12,%zmm12 + + vpternlogq $0x96,%zmm15,%zmm13,%zmm12 + + vmovdqu64 %zmm12,128(%rsp) + + vpclmulqdq $0x11,%zmm9,%zmm10,%zmm13 + vpclmulqdq $0x00,%zmm9,%zmm10,%zmm15 + vpclmulqdq $0x01,%zmm9,%zmm10,%zmm17 + vpclmulqdq $0x10,%zmm9,%zmm10,%zmm10 + vpxorq %zmm17,%zmm10,%zmm10 + + vpsrldq $8,%zmm10,%zmm17 + vpslldq $8,%zmm10,%zmm10 + vpxorq %zmm17,%zmm13,%zmm13 + vpxorq %zmm15,%zmm10,%zmm10 + + + + vmovdqu64 POLY2(%rip),%zmm17 + + vpclmulqdq $0x01,%zmm10,%zmm17,%zmm15 + vpslldq $8,%zmm15,%zmm15 + vpxorq %zmm15,%zmm10,%zmm10 + + + + vpclmulqdq $0x00,%zmm10,%zmm17,%zmm15 + vpsrldq $4,%zmm15,%zmm15 + vpclmulqdq $0x10,%zmm10,%zmm17,%zmm10 + vpslldq $4,%zmm10,%zmm10 + + vpternlogq $0x96,%zmm15,%zmm13,%zmm10 + + vmovdqu64 %zmm10,64(%rsp) + + vpclmulqdq $0x11,%zmm9,%zmm12,%zmm13 + vpclmulqdq $0x00,%zmm9,%zmm12,%zmm15 + vpclmulqdq $0x01,%zmm9,%zmm12,%zmm17 + vpclmulqdq $0x10,%zmm9,%zmm12,%zmm12 + vpxorq %zmm17,%zmm12,%zmm12 + + vpsrldq $8,%zmm12,%zmm17 + vpslldq $8,%zmm12,%zmm12 + vpxorq %zmm17,%zmm13,%zmm13 + vpxorq %zmm15,%zmm12,%zmm12 + 
+ + + vmovdqu64 POLY2(%rip),%zmm17 + + vpclmulqdq $0x01,%zmm12,%zmm17,%zmm15 + vpslldq $8,%zmm15,%zmm15 + vpxorq %zmm15,%zmm12,%zmm12 + + + + vpclmulqdq $0x00,%zmm12,%zmm17,%zmm15 + vpsrldq $4,%zmm15,%zmm15 + vpclmulqdq $0x10,%zmm12,%zmm17,%zmm12 + vpslldq $4,%zmm12,%zmm12 + + vpternlogq $0x96,%zmm15,%zmm13,%zmm12 + + vmovdqu64 %zmm12,0(%rsp) +.L_skip_hkeys_precomputation_2: + movq $1,%rbx + vpxorq %zmm2,%zmm11,%zmm11 + vmovdqu64 0(%rsp),%zmm19 + vpclmulqdq $0x11,%zmm19,%zmm11,%zmm1 + vpclmulqdq $0x00,%zmm19,%zmm11,%zmm9 + vpclmulqdq $0x01,%zmm19,%zmm11,%zmm10 + vpclmulqdq $0x10,%zmm19,%zmm11,%zmm12 + vmovdqu64 64(%rsp),%zmm19 + vpclmulqdq $0x11,%zmm19,%zmm3,%zmm13 + vpclmulqdq $0x00,%zmm19,%zmm3,%zmm15 + vpclmulqdq $0x01,%zmm19,%zmm3,%zmm17 + vpclmulqdq $0x10,%zmm19,%zmm3,%zmm18 + vpxorq %zmm17,%zmm10,%zmm7 + vpxorq %zmm13,%zmm1,%zmm6 + vpxorq %zmm15,%zmm9,%zmm8 + vpternlogq $0x96,%zmm18,%zmm12,%zmm7 + vmovdqu64 128(%rsp),%zmm19 + vpclmulqdq $0x11,%zmm19,%zmm4,%zmm1 + vpclmulqdq $0x00,%zmm19,%zmm4,%zmm9 + vpclmulqdq $0x01,%zmm19,%zmm4,%zmm10 + vpclmulqdq $0x10,%zmm19,%zmm4,%zmm12 + vmovdqu64 192(%rsp),%zmm19 + vpclmulqdq $0x11,%zmm19,%zmm5,%zmm13 + vpclmulqdq $0x00,%zmm19,%zmm5,%zmm15 + vpclmulqdq $0x01,%zmm19,%zmm5,%zmm17 + vpclmulqdq $0x10,%zmm19,%zmm5,%zmm18 + + vpternlogq $0x96,%zmm17,%zmm10,%zmm7 + vpternlogq $0x96,%zmm13,%zmm1,%zmm6 + vpternlogq $0x96,%zmm15,%zmm9,%zmm8 + vpternlogq $0x96,%zmm18,%zmm12,%zmm7 + vmovdqu64 256(%r10),%zmm11 + vmovdqu64 320(%r10),%zmm3 + vmovdqu64 384(%r10),%zmm4 + vmovdqu64 448(%r10),%zmm5 + vpshufb %zmm16,%zmm11,%zmm11 + vpshufb %zmm16,%zmm3,%zmm3 + vpshufb %zmm16,%zmm4,%zmm4 + vpshufb %zmm16,%zmm5,%zmm5 + vmovdqu64 256(%rsp),%zmm19 + vpclmulqdq $0x11,%zmm19,%zmm11,%zmm1 + vpclmulqdq $0x00,%zmm19,%zmm11,%zmm9 + vpclmulqdq $0x01,%zmm19,%zmm11,%zmm10 + vpclmulqdq $0x10,%zmm19,%zmm11,%zmm12 + vmovdqu64 320(%rsp),%zmm19 + vpclmulqdq $0x11,%zmm19,%zmm3,%zmm13 + vpclmulqdq $0x00,%zmm19,%zmm3,%zmm15 + vpclmulqdq $0x01,%zmm19,%zmm3,%zmm17 + vpclmulqdq $0x10,%zmm19,%zmm3,%zmm18 + vpternlogq $0x96,%zmm17,%zmm10,%zmm7 + vpternlogq $0x96,%zmm13,%zmm1,%zmm6 + vpternlogq $0x96,%zmm15,%zmm9,%zmm8 + vpternlogq $0x96,%zmm18,%zmm12,%zmm7 + vmovdqu64 384(%rsp),%zmm19 + vpclmulqdq $0x11,%zmm19,%zmm4,%zmm1 + vpclmulqdq $0x00,%zmm19,%zmm4,%zmm9 + vpclmulqdq $0x01,%zmm19,%zmm4,%zmm10 + vpclmulqdq $0x10,%zmm19,%zmm4,%zmm12 + vmovdqu64 448(%rsp),%zmm19 + vpclmulqdq $0x11,%zmm19,%zmm5,%zmm13 + vpclmulqdq $0x00,%zmm19,%zmm5,%zmm15 + vpclmulqdq $0x01,%zmm19,%zmm5,%zmm17 + vpclmulqdq $0x10,%zmm19,%zmm5,%zmm18 + + vpternlogq $0x96,%zmm17,%zmm10,%zmm7 + vpternlogq $0x96,%zmm13,%zmm1,%zmm6 + vpternlogq $0x96,%zmm15,%zmm9,%zmm8 + vpternlogq $0x96,%zmm18,%zmm12,%zmm7 + vmovdqu64 512(%r10),%zmm11 + vmovdqu64 576(%r10),%zmm3 + vmovdqu64 640(%r10),%zmm4 + vmovdqu64 704(%r10),%zmm5 + vpshufb %zmm16,%zmm11,%zmm11 + vpshufb %zmm16,%zmm3,%zmm3 + vpshufb %zmm16,%zmm4,%zmm4 + vpshufb %zmm16,%zmm5,%zmm5 + vmovdqu64 512(%rsp),%zmm19 + vpclmulqdq $0x11,%zmm19,%zmm11,%zmm1 + vpclmulqdq $0x00,%zmm19,%zmm11,%zmm9 + vpclmulqdq $0x01,%zmm19,%zmm11,%zmm10 + vpclmulqdq $0x10,%zmm19,%zmm11,%zmm12 + vmovdqu64 576(%rsp),%zmm19 + vpclmulqdq $0x11,%zmm19,%zmm3,%zmm13 + vpclmulqdq $0x00,%zmm19,%zmm3,%zmm15 + vpclmulqdq $0x01,%zmm19,%zmm3,%zmm17 + vpclmulqdq $0x10,%zmm19,%zmm3,%zmm18 + vpternlogq $0x96,%zmm17,%zmm10,%zmm7 + vpternlogq $0x96,%zmm13,%zmm1,%zmm6 + vpternlogq $0x96,%zmm15,%zmm9,%zmm8 + vpternlogq $0x96,%zmm18,%zmm12,%zmm7 + vmovdqu64 640(%rsp),%zmm19 + vpclmulqdq $0x11,%zmm19,%zmm4,%zmm1 + 
vpclmulqdq $0x00,%zmm19,%zmm4,%zmm9 + vpclmulqdq $0x01,%zmm19,%zmm4,%zmm10 + vpclmulqdq $0x10,%zmm19,%zmm4,%zmm12 + vmovdqu64 704(%rsp),%zmm19 + vpclmulqdq $0x11,%zmm19,%zmm5,%zmm13 + vpclmulqdq $0x00,%zmm19,%zmm5,%zmm15 + vpclmulqdq $0x01,%zmm19,%zmm5,%zmm17 + vpclmulqdq $0x10,%zmm19,%zmm5,%zmm18 + + vpternlogq $0x96,%zmm17,%zmm10,%zmm7 + vpternlogq $0x96,%zmm13,%zmm1,%zmm6 + vpternlogq $0x96,%zmm15,%zmm9,%zmm8 + vpternlogq $0x96,%zmm18,%zmm12,%zmm7 + + vpsrldq $8,%zmm7,%zmm1 + vpslldq $8,%zmm7,%zmm9 + vpxorq %zmm1,%zmm6,%zmm6 + vpxorq %zmm9,%zmm8,%zmm8 + vextracti64x4 $1,%zmm6,%ymm1 + vpxorq %ymm1,%ymm6,%ymm6 + vextracti32x4 $1,%ymm6,%xmm1 + vpxorq %xmm1,%xmm6,%xmm6 + vextracti64x4 $1,%zmm8,%ymm9 + vpxorq %ymm9,%ymm8,%ymm8 + vextracti32x4 $1,%ymm8,%xmm9 + vpxorq %xmm9,%xmm8,%xmm8 + vmovdqa64 POLY2(%rip),%xmm10 + + + vpclmulqdq $0x01,%xmm8,%xmm10,%xmm1 + vpslldq $8,%xmm1,%xmm1 + vpxorq %xmm1,%xmm8,%xmm1 + + + vpclmulqdq $0x00,%xmm1,%xmm10,%xmm9 + vpsrldq $4,%xmm9,%xmm9 + vpclmulqdq $0x10,%xmm1,%xmm10,%xmm2 + vpslldq $4,%xmm2,%xmm2 + vpternlogq $0x96,%xmm6,%xmm9,%xmm2 + + subq $768,%r11 + je .L_CALC_AAD_done_1 + + addq $768,%r10 + jmp .L_get_AAD_loop48x16_1 + +.L_exit_AAD_loop48x16_1: + + cmpq $512,%r11 + jl .L_less_than_32x16_1 + + vmovdqu64 0(%r10),%zmm11 + vmovdqu64 64(%r10),%zmm3 + vmovdqu64 128(%r10),%zmm4 + vmovdqu64 192(%r10),%zmm5 + vpshufb %zmm16,%zmm11,%zmm11 + vpshufb %zmm16,%zmm3,%zmm3 + vpshufb %zmm16,%zmm4,%zmm4 + vpshufb %zmm16,%zmm5,%zmm5 + testq %rbx,%rbx + jnz .L_skip_hkeys_precomputation_3 + + vmovdqu64 288(%rsi),%zmm1 + vmovdqu64 %zmm1,704(%rsp) + + vmovdqu64 224(%rsi),%zmm9 + vmovdqu64 %zmm9,640(%rsp) + + + vshufi64x2 $0x00,%zmm9,%zmm9,%zmm9 + + vmovdqu64 160(%rsi),%zmm10 + vmovdqu64 %zmm10,576(%rsp) + + vmovdqu64 96(%rsi),%zmm12 + vmovdqu64 %zmm12,512(%rsp) + + vpclmulqdq $0x11,%zmm9,%zmm10,%zmm13 + vpclmulqdq $0x00,%zmm9,%zmm10,%zmm15 + vpclmulqdq $0x01,%zmm9,%zmm10,%zmm17 + vpclmulqdq $0x10,%zmm9,%zmm10,%zmm10 + vpxorq %zmm17,%zmm10,%zmm10 + + vpsrldq $8,%zmm10,%zmm17 + vpslldq $8,%zmm10,%zmm10 + vpxorq %zmm17,%zmm13,%zmm13 + vpxorq %zmm15,%zmm10,%zmm10 + + + + vmovdqu64 POLY2(%rip),%zmm17 + + vpclmulqdq $0x01,%zmm10,%zmm17,%zmm15 + vpslldq $8,%zmm15,%zmm15 + vpxorq %zmm15,%zmm10,%zmm10 + + + + vpclmulqdq $0x00,%zmm10,%zmm17,%zmm15 + vpsrldq $4,%zmm15,%zmm15 + vpclmulqdq $0x10,%zmm10,%zmm17,%zmm10 + vpslldq $4,%zmm10,%zmm10 + + vpternlogq $0x96,%zmm15,%zmm13,%zmm10 + + vmovdqu64 %zmm10,448(%rsp) + + vpclmulqdq $0x11,%zmm9,%zmm12,%zmm13 + vpclmulqdq $0x00,%zmm9,%zmm12,%zmm15 + vpclmulqdq $0x01,%zmm9,%zmm12,%zmm17 + vpclmulqdq $0x10,%zmm9,%zmm12,%zmm12 + vpxorq %zmm17,%zmm12,%zmm12 + + vpsrldq $8,%zmm12,%zmm17 + vpslldq $8,%zmm12,%zmm12 + vpxorq %zmm17,%zmm13,%zmm13 + vpxorq %zmm15,%zmm12,%zmm12 + + + + vmovdqu64 POLY2(%rip),%zmm17 + + vpclmulqdq $0x01,%zmm12,%zmm17,%zmm15 + vpslldq $8,%zmm15,%zmm15 + vpxorq %zmm15,%zmm12,%zmm12 + + + + vpclmulqdq $0x00,%zmm12,%zmm17,%zmm15 + vpsrldq $4,%zmm15,%zmm15 + vpclmulqdq $0x10,%zmm12,%zmm17,%zmm12 + vpslldq $4,%zmm12,%zmm12 + + vpternlogq $0x96,%zmm15,%zmm13,%zmm12 + + vmovdqu64 %zmm12,384(%rsp) + + vpclmulqdq $0x11,%zmm9,%zmm10,%zmm13 + vpclmulqdq $0x00,%zmm9,%zmm10,%zmm15 + vpclmulqdq $0x01,%zmm9,%zmm10,%zmm17 + vpclmulqdq $0x10,%zmm9,%zmm10,%zmm10 + vpxorq %zmm17,%zmm10,%zmm10 + + vpsrldq $8,%zmm10,%zmm17 + vpslldq $8,%zmm10,%zmm10 + vpxorq %zmm17,%zmm13,%zmm13 + vpxorq %zmm15,%zmm10,%zmm10 + + + + vmovdqu64 POLY2(%rip),%zmm17 + + vpclmulqdq $0x01,%zmm10,%zmm17,%zmm15 + vpslldq $8,%zmm15,%zmm15 + vpxorq 
%zmm15,%zmm10,%zmm10 + + + + vpclmulqdq $0x00,%zmm10,%zmm17,%zmm15 + vpsrldq $4,%zmm15,%zmm15 + vpclmulqdq $0x10,%zmm10,%zmm17,%zmm10 + vpslldq $4,%zmm10,%zmm10 + + vpternlogq $0x96,%zmm15,%zmm13,%zmm10 + + vmovdqu64 %zmm10,320(%rsp) + + vpclmulqdq $0x11,%zmm9,%zmm12,%zmm13 + vpclmulqdq $0x00,%zmm9,%zmm12,%zmm15 + vpclmulqdq $0x01,%zmm9,%zmm12,%zmm17 + vpclmulqdq $0x10,%zmm9,%zmm12,%zmm12 + vpxorq %zmm17,%zmm12,%zmm12 + + vpsrldq $8,%zmm12,%zmm17 + vpslldq $8,%zmm12,%zmm12 + vpxorq %zmm17,%zmm13,%zmm13 + vpxorq %zmm15,%zmm12,%zmm12 + + + + vmovdqu64 POLY2(%rip),%zmm17 + + vpclmulqdq $0x01,%zmm12,%zmm17,%zmm15 + vpslldq $8,%zmm15,%zmm15 + vpxorq %zmm15,%zmm12,%zmm12 + + + + vpclmulqdq $0x00,%zmm12,%zmm17,%zmm15 + vpsrldq $4,%zmm15,%zmm15 + vpclmulqdq $0x10,%zmm12,%zmm17,%zmm12 + vpslldq $4,%zmm12,%zmm12 + + vpternlogq $0x96,%zmm15,%zmm13,%zmm12 + + vmovdqu64 %zmm12,256(%rsp) +.L_skip_hkeys_precomputation_3: + movq $1,%rbx + vpxorq %zmm2,%zmm11,%zmm11 + vmovdqu64 256(%rsp),%zmm19 + vpclmulqdq $0x11,%zmm19,%zmm11,%zmm1 + vpclmulqdq $0x00,%zmm19,%zmm11,%zmm9 + vpclmulqdq $0x01,%zmm19,%zmm11,%zmm10 + vpclmulqdq $0x10,%zmm19,%zmm11,%zmm12 + vmovdqu64 320(%rsp),%zmm19 + vpclmulqdq $0x11,%zmm19,%zmm3,%zmm13 + vpclmulqdq $0x00,%zmm19,%zmm3,%zmm15 + vpclmulqdq $0x01,%zmm19,%zmm3,%zmm17 + vpclmulqdq $0x10,%zmm19,%zmm3,%zmm18 + vpxorq %zmm17,%zmm10,%zmm7 + vpxorq %zmm13,%zmm1,%zmm6 + vpxorq %zmm15,%zmm9,%zmm8 + vpternlogq $0x96,%zmm18,%zmm12,%zmm7 + vmovdqu64 384(%rsp),%zmm19 + vpclmulqdq $0x11,%zmm19,%zmm4,%zmm1 + vpclmulqdq $0x00,%zmm19,%zmm4,%zmm9 + vpclmulqdq $0x01,%zmm19,%zmm4,%zmm10 + vpclmulqdq $0x10,%zmm19,%zmm4,%zmm12 + vmovdqu64 448(%rsp),%zmm19 + vpclmulqdq $0x11,%zmm19,%zmm5,%zmm13 + vpclmulqdq $0x00,%zmm19,%zmm5,%zmm15 + vpclmulqdq $0x01,%zmm19,%zmm5,%zmm17 + vpclmulqdq $0x10,%zmm19,%zmm5,%zmm18 + + vpternlogq $0x96,%zmm17,%zmm10,%zmm7 + vpternlogq $0x96,%zmm13,%zmm1,%zmm6 + vpternlogq $0x96,%zmm15,%zmm9,%zmm8 + vpternlogq $0x96,%zmm18,%zmm12,%zmm7 + vmovdqu64 256(%r10),%zmm11 + vmovdqu64 320(%r10),%zmm3 + vmovdqu64 384(%r10),%zmm4 + vmovdqu64 448(%r10),%zmm5 + vpshufb %zmm16,%zmm11,%zmm11 + vpshufb %zmm16,%zmm3,%zmm3 + vpshufb %zmm16,%zmm4,%zmm4 + vpshufb %zmm16,%zmm5,%zmm5 + vmovdqu64 512(%rsp),%zmm19 + vpclmulqdq $0x11,%zmm19,%zmm11,%zmm1 + vpclmulqdq $0x00,%zmm19,%zmm11,%zmm9 + vpclmulqdq $0x01,%zmm19,%zmm11,%zmm10 + vpclmulqdq $0x10,%zmm19,%zmm11,%zmm12 + vmovdqu64 576(%rsp),%zmm19 + vpclmulqdq $0x11,%zmm19,%zmm3,%zmm13 + vpclmulqdq $0x00,%zmm19,%zmm3,%zmm15 + vpclmulqdq $0x01,%zmm19,%zmm3,%zmm17 + vpclmulqdq $0x10,%zmm19,%zmm3,%zmm18 + vpternlogq $0x96,%zmm17,%zmm10,%zmm7 + vpternlogq $0x96,%zmm13,%zmm1,%zmm6 + vpternlogq $0x96,%zmm15,%zmm9,%zmm8 + vpternlogq $0x96,%zmm18,%zmm12,%zmm7 + vmovdqu64 640(%rsp),%zmm19 + vpclmulqdq $0x11,%zmm19,%zmm4,%zmm1 + vpclmulqdq $0x00,%zmm19,%zmm4,%zmm9 + vpclmulqdq $0x01,%zmm19,%zmm4,%zmm10 + vpclmulqdq $0x10,%zmm19,%zmm4,%zmm12 + vmovdqu64 704(%rsp),%zmm19 + vpclmulqdq $0x11,%zmm19,%zmm5,%zmm13 + vpclmulqdq $0x00,%zmm19,%zmm5,%zmm15 + vpclmulqdq $0x01,%zmm19,%zmm5,%zmm17 + vpclmulqdq $0x10,%zmm19,%zmm5,%zmm18 + + vpternlogq $0x96,%zmm17,%zmm10,%zmm7 + vpternlogq $0x96,%zmm13,%zmm1,%zmm6 + vpternlogq $0x96,%zmm15,%zmm9,%zmm8 + vpternlogq $0x96,%zmm18,%zmm12,%zmm7 + + vpsrldq $8,%zmm7,%zmm1 + vpslldq $8,%zmm7,%zmm9 + vpxorq %zmm1,%zmm6,%zmm6 + vpxorq %zmm9,%zmm8,%zmm8 + vextracti64x4 $1,%zmm6,%ymm1 + vpxorq %ymm1,%ymm6,%ymm6 + vextracti32x4 $1,%ymm6,%xmm1 + vpxorq %xmm1,%xmm6,%xmm6 + vextracti64x4 $1,%zmm8,%ymm9 + vpxorq %ymm9,%ymm8,%ymm8 + 
vextracti32x4 $1,%ymm8,%xmm9 + vpxorq %xmm9,%xmm8,%xmm8 + vmovdqa64 POLY2(%rip),%xmm10 + + + vpclmulqdq $0x01,%xmm8,%xmm10,%xmm1 + vpslldq $8,%xmm1,%xmm1 + vpxorq %xmm1,%xmm8,%xmm1 + + + vpclmulqdq $0x00,%xmm1,%xmm10,%xmm9 + vpsrldq $4,%xmm9,%xmm9 + vpclmulqdq $0x10,%xmm1,%xmm10,%xmm2 + vpslldq $4,%xmm2,%xmm2 + vpternlogq $0x96,%xmm6,%xmm9,%xmm2 + + subq $512,%r11 + je .L_CALC_AAD_done_1 + + addq $512,%r10 + jmp .L_less_than_16x16_1 + +.L_less_than_32x16_1: + cmpq $256,%r11 + jl .L_less_than_16x16_1 + + vmovdqu64 0(%r10),%zmm11 + vmovdqu64 64(%r10),%zmm3 + vmovdqu64 128(%r10),%zmm4 + vmovdqu64 192(%r10),%zmm5 + vpshufb %zmm16,%zmm11,%zmm11 + vpshufb %zmm16,%zmm3,%zmm3 + vpshufb %zmm16,%zmm4,%zmm4 + vpshufb %zmm16,%zmm5,%zmm5 + vpxorq %zmm2,%zmm11,%zmm11 + vmovdqu64 96(%rsi),%zmm19 + vpclmulqdq $0x11,%zmm19,%zmm11,%zmm1 + vpclmulqdq $0x00,%zmm19,%zmm11,%zmm9 + vpclmulqdq $0x01,%zmm19,%zmm11,%zmm10 + vpclmulqdq $0x10,%zmm19,%zmm11,%zmm12 + vmovdqu64 160(%rsi),%zmm19 + vpclmulqdq $0x11,%zmm19,%zmm3,%zmm13 + vpclmulqdq $0x00,%zmm19,%zmm3,%zmm15 + vpclmulqdq $0x01,%zmm19,%zmm3,%zmm17 + vpclmulqdq $0x10,%zmm19,%zmm3,%zmm18 + vpxorq %zmm17,%zmm10,%zmm7 + vpxorq %zmm13,%zmm1,%zmm6 + vpxorq %zmm15,%zmm9,%zmm8 + vpternlogq $0x96,%zmm18,%zmm12,%zmm7 + vmovdqu64 224(%rsi),%zmm19 + vpclmulqdq $0x11,%zmm19,%zmm4,%zmm1 + vpclmulqdq $0x00,%zmm19,%zmm4,%zmm9 + vpclmulqdq $0x01,%zmm19,%zmm4,%zmm10 + vpclmulqdq $0x10,%zmm19,%zmm4,%zmm12 + vmovdqu64 288(%rsi),%zmm19 + vpclmulqdq $0x11,%zmm19,%zmm5,%zmm13 + vpclmulqdq $0x00,%zmm19,%zmm5,%zmm15 + vpclmulqdq $0x01,%zmm19,%zmm5,%zmm17 + vpclmulqdq $0x10,%zmm19,%zmm5,%zmm18 + + vpternlogq $0x96,%zmm17,%zmm10,%zmm7 + vpternlogq $0x96,%zmm13,%zmm1,%zmm6 + vpternlogq $0x96,%zmm15,%zmm9,%zmm8 + vpternlogq $0x96,%zmm18,%zmm12,%zmm7 + + vpsrldq $8,%zmm7,%zmm1 + vpslldq $8,%zmm7,%zmm9 + vpxorq %zmm1,%zmm6,%zmm6 + vpxorq %zmm9,%zmm8,%zmm8 + vextracti64x4 $1,%zmm6,%ymm1 + vpxorq %ymm1,%ymm6,%ymm6 + vextracti32x4 $1,%ymm6,%xmm1 + vpxorq %xmm1,%xmm6,%xmm6 + vextracti64x4 $1,%zmm8,%ymm9 + vpxorq %ymm9,%ymm8,%ymm8 + vextracti32x4 $1,%ymm8,%xmm9 + vpxorq %xmm9,%xmm8,%xmm8 + vmovdqa64 POLY2(%rip),%xmm10 + + + vpclmulqdq $0x01,%xmm8,%xmm10,%xmm1 + vpslldq $8,%xmm1,%xmm1 + vpxorq %xmm1,%xmm8,%xmm1 + + + vpclmulqdq $0x00,%xmm1,%xmm10,%xmm9 + vpsrldq $4,%xmm9,%xmm9 + vpclmulqdq $0x10,%xmm1,%xmm10,%xmm2 + vpslldq $4,%xmm2,%xmm2 + vpternlogq $0x96,%xmm6,%xmm9,%xmm2 + + subq $256,%r11 + je .L_CALC_AAD_done_1 + + addq $256,%r10 + +.L_less_than_16x16_1: + + leaq byte64_len_to_mask_table(%rip),%r12 + leaq (%r12,%r11,8),%r12 + + + addl $15,%r11d + shrl $4,%r11d + cmpl $2,%r11d + jb .L_AAD_blocks_1_1 + je .L_AAD_blocks_2_1 + cmpl $4,%r11d + jb .L_AAD_blocks_3_1 + je .L_AAD_blocks_4_1 + cmpl $6,%r11d + jb .L_AAD_blocks_5_1 + je .L_AAD_blocks_6_1 + cmpl $8,%r11d + jb .L_AAD_blocks_7_1 + je .L_AAD_blocks_8_1 + cmpl $10,%r11d + jb .L_AAD_blocks_9_1 + je .L_AAD_blocks_10_1 + cmpl $12,%r11d + jb .L_AAD_blocks_11_1 + je .L_AAD_blocks_12_1 + cmpl $14,%r11d + jb .L_AAD_blocks_13_1 + je .L_AAD_blocks_14_1 + cmpl $15,%r11d + je .L_AAD_blocks_15_1 +.L_AAD_blocks_16_1: + subq $1536,%r12 + kmovq (%r12),%k1 + vmovdqu8 0(%r10),%zmm11 + vmovdqu8 64(%r10),%zmm3 + vmovdqu8 128(%r10),%zmm4 + vmovdqu8 192(%r10),%zmm5{%k1}{z} + vpshufb %zmm16,%zmm11,%zmm11 + vpshufb %zmm16,%zmm3,%zmm3 + vpshufb %zmm16,%zmm4,%zmm4 + vpshufb %zmm16,%zmm5,%zmm5 + vpxorq %zmm2,%zmm11,%zmm11 + vmovdqu64 96(%rsi),%zmm15 + vpclmulqdq $0x11,%zmm15,%zmm11,%zmm1 + vpclmulqdq $0x00,%zmm15,%zmm11,%zmm6 + vpclmulqdq $0x01,%zmm15,%zmm11,%zmm7 
+ vpclmulqdq $0x10,%zmm15,%zmm11,%zmm8 + vmovdqu64 160(%rsi),%zmm15 + vpclmulqdq $0x11,%zmm15,%zmm3,%zmm9 + vpclmulqdq $0x00,%zmm15,%zmm3,%zmm10 + vpclmulqdq $0x01,%zmm15,%zmm3,%zmm12 + vpclmulqdq $0x10,%zmm15,%zmm3,%zmm13 + vmovdqu64 224(%rsi),%zmm15 + vpclmulqdq $0x11,%zmm15,%zmm4,%zmm11 + vpclmulqdq $0x00,%zmm15,%zmm4,%zmm3 + vpternlogq $0x96,%zmm9,%zmm11,%zmm1 + vpternlogq $0x96,%zmm10,%zmm3,%zmm6 + vpclmulqdq $0x01,%zmm15,%zmm4,%zmm11 + vpclmulqdq $0x10,%zmm15,%zmm4,%zmm3 + vpternlogq $0x96,%zmm12,%zmm11,%zmm7 + vpternlogq $0x96,%zmm13,%zmm3,%zmm8 + vmovdqu64 288(%rsi),%zmm15 + vpclmulqdq $0x11,%zmm15,%zmm5,%zmm9 + vpclmulqdq $0x00,%zmm15,%zmm5,%zmm10 + vpclmulqdq $0x01,%zmm15,%zmm5,%zmm12 + vpclmulqdq $0x10,%zmm15,%zmm5,%zmm13 + vpxorq %zmm9,%zmm1,%zmm9 + vpxorq %zmm10,%zmm6,%zmm10 + vpxorq %zmm12,%zmm7,%zmm12 + vpxorq %zmm13,%zmm8,%zmm13 + + vpxorq %zmm13,%zmm12,%zmm12 + vpsrldq $8,%zmm12,%zmm7 + vpslldq $8,%zmm12,%zmm8 + vpxorq %zmm7,%zmm9,%zmm1 + vpxorq %zmm8,%zmm10,%zmm6 + vextracti64x4 $1,%zmm1,%ymm12 + vpxorq %ymm12,%ymm1,%ymm1 + vextracti32x4 $1,%ymm1,%xmm12 + vpxorq %xmm12,%xmm1,%xmm1 + vextracti64x4 $1,%zmm6,%ymm13 + vpxorq %ymm13,%ymm6,%ymm6 + vextracti32x4 $1,%ymm6,%xmm13 + vpxorq %xmm13,%xmm6,%xmm6 + vmovdqa64 POLY2(%rip),%xmm15 + + + vpclmulqdq $0x01,%xmm6,%xmm15,%xmm7 + vpslldq $8,%xmm7,%xmm7 + vpxorq %xmm7,%xmm6,%xmm7 + + + vpclmulqdq $0x00,%xmm7,%xmm15,%xmm8 + vpsrldq $4,%xmm8,%xmm8 + vpclmulqdq $0x10,%xmm7,%xmm15,%xmm2 + vpslldq $4,%xmm2,%xmm2 + vpternlogq $0x96,%xmm1,%xmm8,%xmm2 + + jmp .L_CALC_AAD_done_1 +.L_AAD_blocks_15_1: + subq $1536,%r12 + kmovq (%r12),%k1 + vmovdqu8 0(%r10),%zmm11 + vmovdqu8 64(%r10),%zmm3 + vmovdqu8 128(%r10),%zmm4 + vmovdqu8 192(%r10),%zmm5{%k1}{z} + vpshufb %zmm16,%zmm11,%zmm11 + vpshufb %zmm16,%zmm3,%zmm3 + vpshufb %zmm16,%zmm4,%zmm4 + vpshufb %zmm16,%zmm5,%zmm5 + vpxorq %zmm2,%zmm11,%zmm11 + vmovdqu64 112(%rsi),%zmm15 + vpclmulqdq $0x11,%zmm15,%zmm11,%zmm1 + vpclmulqdq $0x00,%zmm15,%zmm11,%zmm6 + vpclmulqdq $0x01,%zmm15,%zmm11,%zmm7 + vpclmulqdq $0x10,%zmm15,%zmm11,%zmm8 + vmovdqu64 176(%rsi),%zmm15 + vpclmulqdq $0x11,%zmm15,%zmm3,%zmm9 + vpclmulqdq $0x00,%zmm15,%zmm3,%zmm10 + vpclmulqdq $0x01,%zmm15,%zmm3,%zmm12 + vpclmulqdq $0x10,%zmm15,%zmm3,%zmm13 + vmovdqu64 240(%rsi),%zmm15 + vpclmulqdq $0x11,%zmm15,%zmm4,%zmm11 + vpclmulqdq $0x00,%zmm15,%zmm4,%zmm3 + vpternlogq $0x96,%zmm1,%zmm11,%zmm9 + vpternlogq $0x96,%zmm6,%zmm3,%zmm10 + vpclmulqdq $0x01,%zmm15,%zmm4,%zmm11 + vpclmulqdq $0x10,%zmm15,%zmm4,%zmm3 + vpternlogq $0x96,%zmm7,%zmm11,%zmm12 + vpternlogq $0x96,%zmm8,%zmm3,%zmm13 + vmovdqu64 304(%rsi),%ymm15 + vinserti64x2 $2,336(%rsi),%zmm15,%zmm15 + vpclmulqdq $0x01,%zmm15,%zmm5,%zmm7 + vpclmulqdq $0x10,%zmm15,%zmm5,%zmm8 + vpclmulqdq $0x11,%zmm15,%zmm5,%zmm1 + vpclmulqdq $0x00,%zmm15,%zmm5,%zmm6 + + vpxorq %zmm12,%zmm7,%zmm7 + vpxorq %zmm13,%zmm8,%zmm8 + vpxorq %zmm9,%zmm1,%zmm1 + vpxorq %zmm10,%zmm6,%zmm6 + + vpxorq %zmm8,%zmm7,%zmm7 + vpsrldq $8,%zmm7,%zmm12 + vpslldq $8,%zmm7,%zmm13 + vpxorq %zmm12,%zmm1,%zmm1 + vpxorq %zmm13,%zmm6,%zmm6 + vextracti64x4 $1,%zmm1,%ymm12 + vpxorq %ymm12,%ymm1,%ymm1 + vextracti32x4 $1,%ymm1,%xmm12 + vpxorq %xmm12,%xmm1,%xmm1 + vextracti64x4 $1,%zmm6,%ymm13 + vpxorq %ymm13,%ymm6,%ymm6 + vextracti32x4 $1,%ymm6,%xmm13 + vpxorq %xmm13,%xmm6,%xmm6 + vmovdqa64 POLY2(%rip),%xmm15 + + + vpclmulqdq $0x01,%xmm6,%xmm15,%xmm7 + vpslldq $8,%xmm7,%xmm7 + vpxorq %xmm7,%xmm6,%xmm7 + + + vpclmulqdq $0x00,%xmm7,%xmm15,%xmm8 + vpsrldq $4,%xmm8,%xmm8 + vpclmulqdq $0x10,%xmm7,%xmm15,%xmm2 + vpslldq $4,%xmm2,%xmm2 + 
vpternlogq $0x96,%xmm1,%xmm8,%xmm2 + + jmp .L_CALC_AAD_done_1 +.L_AAD_blocks_14_1: + subq $1536,%r12 + kmovq (%r12),%k1 + vmovdqu8 0(%r10),%zmm11 + vmovdqu8 64(%r10),%zmm3 + vmovdqu8 128(%r10),%zmm4 + vmovdqu8 192(%r10),%ymm5{%k1}{z} + vpshufb %zmm16,%zmm11,%zmm11 + vpshufb %zmm16,%zmm3,%zmm3 + vpshufb %zmm16,%zmm4,%zmm4 + vpshufb %ymm16,%ymm5,%ymm5 + vpxorq %zmm2,%zmm11,%zmm11 + vmovdqu64 128(%rsi),%zmm15 + vpclmulqdq $0x11,%zmm15,%zmm11,%zmm1 + vpclmulqdq $0x00,%zmm15,%zmm11,%zmm6 + vpclmulqdq $0x01,%zmm15,%zmm11,%zmm7 + vpclmulqdq $0x10,%zmm15,%zmm11,%zmm8 + vmovdqu64 192(%rsi),%zmm15 + vpclmulqdq $0x11,%zmm15,%zmm3,%zmm9 + vpclmulqdq $0x00,%zmm15,%zmm3,%zmm10 + vpclmulqdq $0x01,%zmm15,%zmm3,%zmm12 + vpclmulqdq $0x10,%zmm15,%zmm3,%zmm13 + vmovdqu64 256(%rsi),%zmm15 + vpclmulqdq $0x11,%zmm15,%zmm4,%zmm11 + vpclmulqdq $0x00,%zmm15,%zmm4,%zmm3 + vpternlogq $0x96,%zmm1,%zmm11,%zmm9 + vpternlogq $0x96,%zmm6,%zmm3,%zmm10 + vpclmulqdq $0x01,%zmm15,%zmm4,%zmm11 + vpclmulqdq $0x10,%zmm15,%zmm4,%zmm3 + vpternlogq $0x96,%zmm7,%zmm11,%zmm12 + vpternlogq $0x96,%zmm8,%zmm3,%zmm13 + vmovdqu64 320(%rsi),%ymm15 + vpclmulqdq $0x01,%ymm15,%ymm5,%ymm7 + vpclmulqdq $0x10,%ymm15,%ymm5,%ymm8 + vpclmulqdq $0x11,%ymm15,%ymm5,%ymm1 + vpclmulqdq $0x00,%ymm15,%ymm5,%ymm6 + + vpxorq %zmm12,%zmm7,%zmm7 + vpxorq %zmm13,%zmm8,%zmm8 + vpxorq %zmm9,%zmm1,%zmm1 + vpxorq %zmm10,%zmm6,%zmm6 + + vpxorq %zmm8,%zmm7,%zmm7 + vpsrldq $8,%zmm7,%zmm12 + vpslldq $8,%zmm7,%zmm13 + vpxorq %zmm12,%zmm1,%zmm1 + vpxorq %zmm13,%zmm6,%zmm6 + vextracti64x4 $1,%zmm1,%ymm12 + vpxorq %ymm12,%ymm1,%ymm1 + vextracti32x4 $1,%ymm1,%xmm12 + vpxorq %xmm12,%xmm1,%xmm1 + vextracti64x4 $1,%zmm6,%ymm13 + vpxorq %ymm13,%ymm6,%ymm6 + vextracti32x4 $1,%ymm6,%xmm13 + vpxorq %xmm13,%xmm6,%xmm6 + vmovdqa64 POLY2(%rip),%xmm15 + + + vpclmulqdq $0x01,%xmm6,%xmm15,%xmm7 + vpslldq $8,%xmm7,%xmm7 + vpxorq %xmm7,%xmm6,%xmm7 + + + vpclmulqdq $0x00,%xmm7,%xmm15,%xmm8 + vpsrldq $4,%xmm8,%xmm8 + vpclmulqdq $0x10,%xmm7,%xmm15,%xmm2 + vpslldq $4,%xmm2,%xmm2 + vpternlogq $0x96,%xmm1,%xmm8,%xmm2 + + jmp .L_CALC_AAD_done_1 +.L_AAD_blocks_13_1: + subq $1536,%r12 + kmovq (%r12),%k1 + vmovdqu8 0(%r10),%zmm11 + vmovdqu8 64(%r10),%zmm3 + vmovdqu8 128(%r10),%zmm4 + vmovdqu8 192(%r10),%xmm5{%k1}{z} + vpshufb %zmm16,%zmm11,%zmm11 + vpshufb %zmm16,%zmm3,%zmm3 + vpshufb %zmm16,%zmm4,%zmm4 + vpshufb %xmm16,%xmm5,%xmm5 + vpxorq %zmm2,%zmm11,%zmm11 + vmovdqu64 144(%rsi),%zmm15 + vpclmulqdq $0x11,%zmm15,%zmm11,%zmm1 + vpclmulqdq $0x00,%zmm15,%zmm11,%zmm6 + vpclmulqdq $0x01,%zmm15,%zmm11,%zmm7 + vpclmulqdq $0x10,%zmm15,%zmm11,%zmm8 + vmovdqu64 208(%rsi),%zmm15 + vpclmulqdq $0x11,%zmm15,%zmm3,%zmm9 + vpclmulqdq $0x00,%zmm15,%zmm3,%zmm10 + vpclmulqdq $0x01,%zmm15,%zmm3,%zmm12 + vpclmulqdq $0x10,%zmm15,%zmm3,%zmm13 + vmovdqu64 272(%rsi),%zmm15 + vpclmulqdq $0x11,%zmm15,%zmm4,%zmm11 + vpclmulqdq $0x00,%zmm15,%zmm4,%zmm3 + vpternlogq $0x96,%zmm1,%zmm11,%zmm9 + vpternlogq $0x96,%zmm6,%zmm3,%zmm10 + vpclmulqdq $0x01,%zmm15,%zmm4,%zmm11 + vpclmulqdq $0x10,%zmm15,%zmm4,%zmm3 + vpternlogq $0x96,%zmm7,%zmm11,%zmm12 + vpternlogq $0x96,%zmm8,%zmm3,%zmm13 + vmovdqu64 336(%rsi),%xmm15 + vpclmulqdq $0x01,%xmm15,%xmm5,%xmm7 + vpclmulqdq $0x10,%xmm15,%xmm5,%xmm8 + vpclmulqdq $0x11,%xmm15,%xmm5,%xmm1 + vpclmulqdq $0x00,%xmm15,%xmm5,%xmm6 + + vpxorq %zmm12,%zmm7,%zmm7 + vpxorq %zmm13,%zmm8,%zmm8 + vpxorq %zmm9,%zmm1,%zmm1 + vpxorq %zmm10,%zmm6,%zmm6 + + vpxorq %zmm8,%zmm7,%zmm7 + vpsrldq $8,%zmm7,%zmm12 + vpslldq $8,%zmm7,%zmm13 + vpxorq %zmm12,%zmm1,%zmm1 + vpxorq %zmm13,%zmm6,%zmm6 + vextracti64x4 
$1,%zmm1,%ymm12 + vpxorq %ymm12,%ymm1,%ymm1 + vextracti32x4 $1,%ymm1,%xmm12 + vpxorq %xmm12,%xmm1,%xmm1 + vextracti64x4 $1,%zmm6,%ymm13 + vpxorq %ymm13,%ymm6,%ymm6 + vextracti32x4 $1,%ymm6,%xmm13 + vpxorq %xmm13,%xmm6,%xmm6 + vmovdqa64 POLY2(%rip),%xmm15 + + + vpclmulqdq $0x01,%xmm6,%xmm15,%xmm7 + vpslldq $8,%xmm7,%xmm7 + vpxorq %xmm7,%xmm6,%xmm7 + + + vpclmulqdq $0x00,%xmm7,%xmm15,%xmm8 + vpsrldq $4,%xmm8,%xmm8 + vpclmulqdq $0x10,%xmm7,%xmm15,%xmm2 + vpslldq $4,%xmm2,%xmm2 + vpternlogq $0x96,%xmm1,%xmm8,%xmm2 + + jmp .L_CALC_AAD_done_1 +.L_AAD_blocks_12_1: + subq $1024,%r12 + kmovq (%r12),%k1 + vmovdqu8 0(%r10),%zmm11 + vmovdqu8 64(%r10),%zmm3 + vmovdqu8 128(%r10),%zmm4{%k1}{z} + vpshufb %zmm16,%zmm11,%zmm11 + vpshufb %zmm16,%zmm3,%zmm3 + vpshufb %zmm16,%zmm4,%zmm4 + vpxorq %zmm2,%zmm11,%zmm11 + vmovdqu64 160(%rsi),%zmm15 + vpclmulqdq $0x11,%zmm15,%zmm11,%zmm1 + vpclmulqdq $0x00,%zmm15,%zmm11,%zmm6 + vpclmulqdq $0x01,%zmm15,%zmm11,%zmm7 + vpclmulqdq $0x10,%zmm15,%zmm11,%zmm8 + vmovdqu64 224(%rsi),%zmm15 + vpclmulqdq $0x11,%zmm15,%zmm3,%zmm9 + vpclmulqdq $0x00,%zmm15,%zmm3,%zmm10 + vpclmulqdq $0x01,%zmm15,%zmm3,%zmm12 + vpclmulqdq $0x10,%zmm15,%zmm3,%zmm13 + vmovdqu64 288(%rsi),%zmm15 + vpclmulqdq $0x11,%zmm15,%zmm4,%zmm11 + vpclmulqdq $0x00,%zmm15,%zmm4,%zmm3 + vpternlogq $0x96,%zmm1,%zmm11,%zmm9 + vpternlogq $0x96,%zmm6,%zmm3,%zmm10 + vpclmulqdq $0x01,%zmm15,%zmm4,%zmm11 + vpclmulqdq $0x10,%zmm15,%zmm4,%zmm3 + vpternlogq $0x96,%zmm7,%zmm11,%zmm12 + vpternlogq $0x96,%zmm8,%zmm3,%zmm13 + + vpxorq %zmm13,%zmm12,%zmm12 + vpsrldq $8,%zmm12,%zmm7 + vpslldq $8,%zmm12,%zmm8 + vpxorq %zmm7,%zmm9,%zmm1 + vpxorq %zmm8,%zmm10,%zmm6 + vextracti64x4 $1,%zmm1,%ymm12 + vpxorq %ymm12,%ymm1,%ymm1 + vextracti32x4 $1,%ymm1,%xmm12 + vpxorq %xmm12,%xmm1,%xmm1 + vextracti64x4 $1,%zmm6,%ymm13 + vpxorq %ymm13,%ymm6,%ymm6 + vextracti32x4 $1,%ymm6,%xmm13 + vpxorq %xmm13,%xmm6,%xmm6 + vmovdqa64 POLY2(%rip),%xmm15 + + + vpclmulqdq $0x01,%xmm6,%xmm15,%xmm7 + vpslldq $8,%xmm7,%xmm7 + vpxorq %xmm7,%xmm6,%xmm7 + + + vpclmulqdq $0x00,%xmm7,%xmm15,%xmm8 + vpsrldq $4,%xmm8,%xmm8 + vpclmulqdq $0x10,%xmm7,%xmm15,%xmm2 + vpslldq $4,%xmm2,%xmm2 + vpternlogq $0x96,%xmm1,%xmm8,%xmm2 + + jmp .L_CALC_AAD_done_1 +.L_AAD_blocks_11_1: + subq $1024,%r12 + kmovq (%r12),%k1 + vmovdqu8 0(%r10),%zmm11 + vmovdqu8 64(%r10),%zmm3 + vmovdqu8 128(%r10),%zmm4{%k1}{z} + vpshufb %zmm16,%zmm11,%zmm11 + vpshufb %zmm16,%zmm3,%zmm3 + vpshufb %zmm16,%zmm4,%zmm4 + vpxorq %zmm2,%zmm11,%zmm11 + vmovdqu64 176(%rsi),%zmm15 + vpclmulqdq $0x11,%zmm15,%zmm11,%zmm1 + vpclmulqdq $0x00,%zmm15,%zmm11,%zmm6 + vpclmulqdq $0x01,%zmm15,%zmm11,%zmm7 + vpclmulqdq $0x10,%zmm15,%zmm11,%zmm8 + vmovdqu64 240(%rsi),%zmm15 + vpclmulqdq $0x11,%zmm15,%zmm3,%zmm9 + vpclmulqdq $0x00,%zmm15,%zmm3,%zmm10 + vpclmulqdq $0x01,%zmm15,%zmm3,%zmm12 + vpclmulqdq $0x10,%zmm15,%zmm3,%zmm13 + vpxorq %zmm9,%zmm1,%zmm9 + vpxorq %zmm10,%zmm6,%zmm10 + vpxorq %zmm12,%zmm7,%zmm12 + vpxorq %zmm13,%zmm8,%zmm13 + vmovdqu64 304(%rsi),%ymm15 + vinserti64x2 $2,336(%rsi),%zmm15,%zmm15 + vpclmulqdq $0x01,%zmm15,%zmm4,%zmm7 + vpclmulqdq $0x10,%zmm15,%zmm4,%zmm8 + vpclmulqdq $0x11,%zmm15,%zmm4,%zmm1 + vpclmulqdq $0x00,%zmm15,%zmm4,%zmm6 + + vpxorq %zmm12,%zmm7,%zmm7 + vpxorq %zmm13,%zmm8,%zmm8 + vpxorq %zmm9,%zmm1,%zmm1 + vpxorq %zmm10,%zmm6,%zmm6 + + vpxorq %zmm8,%zmm7,%zmm7 + vpsrldq $8,%zmm7,%zmm12 + vpslldq $8,%zmm7,%zmm13 + vpxorq %zmm12,%zmm1,%zmm1 + vpxorq %zmm13,%zmm6,%zmm6 + vextracti64x4 $1,%zmm1,%ymm12 + vpxorq %ymm12,%ymm1,%ymm1 + vextracti32x4 $1,%ymm1,%xmm12 + vpxorq %xmm12,%xmm1,%xmm1 + 
vextracti64x4 $1,%zmm6,%ymm13 + vpxorq %ymm13,%ymm6,%ymm6 + vextracti32x4 $1,%ymm6,%xmm13 + vpxorq %xmm13,%xmm6,%xmm6 + vmovdqa64 POLY2(%rip),%xmm15 + + + vpclmulqdq $0x01,%xmm6,%xmm15,%xmm7 + vpslldq $8,%xmm7,%xmm7 + vpxorq %xmm7,%xmm6,%xmm7 + + + vpclmulqdq $0x00,%xmm7,%xmm15,%xmm8 + vpsrldq $4,%xmm8,%xmm8 + vpclmulqdq $0x10,%xmm7,%xmm15,%xmm2 + vpslldq $4,%xmm2,%xmm2 + vpternlogq $0x96,%xmm1,%xmm8,%xmm2 + + jmp .L_CALC_AAD_done_1 +.L_AAD_blocks_10_1: + subq $1024,%r12 + kmovq (%r12),%k1 + vmovdqu8 0(%r10),%zmm11 + vmovdqu8 64(%r10),%zmm3 + vmovdqu8 128(%r10),%ymm4{%k1}{z} + vpshufb %zmm16,%zmm11,%zmm11 + vpshufb %zmm16,%zmm3,%zmm3 + vpshufb %ymm16,%ymm4,%ymm4 + vpxorq %zmm2,%zmm11,%zmm11 + vmovdqu64 192(%rsi),%zmm15 + vpclmulqdq $0x11,%zmm15,%zmm11,%zmm1 + vpclmulqdq $0x00,%zmm15,%zmm11,%zmm6 + vpclmulqdq $0x01,%zmm15,%zmm11,%zmm7 + vpclmulqdq $0x10,%zmm15,%zmm11,%zmm8 + vmovdqu64 256(%rsi),%zmm15 + vpclmulqdq $0x11,%zmm15,%zmm3,%zmm9 + vpclmulqdq $0x00,%zmm15,%zmm3,%zmm10 + vpclmulqdq $0x01,%zmm15,%zmm3,%zmm12 + vpclmulqdq $0x10,%zmm15,%zmm3,%zmm13 + vpxorq %zmm9,%zmm1,%zmm9 + vpxorq %zmm10,%zmm6,%zmm10 + vpxorq %zmm12,%zmm7,%zmm12 + vpxorq %zmm13,%zmm8,%zmm13 + vmovdqu64 320(%rsi),%ymm15 + vpclmulqdq $0x01,%ymm15,%ymm4,%ymm7 + vpclmulqdq $0x10,%ymm15,%ymm4,%ymm8 + vpclmulqdq $0x11,%ymm15,%ymm4,%ymm1 + vpclmulqdq $0x00,%ymm15,%ymm4,%ymm6 + + vpxorq %zmm12,%zmm7,%zmm7 + vpxorq %zmm13,%zmm8,%zmm8 + vpxorq %zmm9,%zmm1,%zmm1 + vpxorq %zmm10,%zmm6,%zmm6 + + vpxorq %zmm8,%zmm7,%zmm7 + vpsrldq $8,%zmm7,%zmm12 + vpslldq $8,%zmm7,%zmm13 + vpxorq %zmm12,%zmm1,%zmm1 + vpxorq %zmm13,%zmm6,%zmm6 + vextracti64x4 $1,%zmm1,%ymm12 + vpxorq %ymm12,%ymm1,%ymm1 + vextracti32x4 $1,%ymm1,%xmm12 + vpxorq %xmm12,%xmm1,%xmm1 + vextracti64x4 $1,%zmm6,%ymm13 + vpxorq %ymm13,%ymm6,%ymm6 + vextracti32x4 $1,%ymm6,%xmm13 + vpxorq %xmm13,%xmm6,%xmm6 + vmovdqa64 POLY2(%rip),%xmm15 + + + vpclmulqdq $0x01,%xmm6,%xmm15,%xmm7 + vpslldq $8,%xmm7,%xmm7 + vpxorq %xmm7,%xmm6,%xmm7 + + + vpclmulqdq $0x00,%xmm7,%xmm15,%xmm8 + vpsrldq $4,%xmm8,%xmm8 + vpclmulqdq $0x10,%xmm7,%xmm15,%xmm2 + vpslldq $4,%xmm2,%xmm2 + vpternlogq $0x96,%xmm1,%xmm8,%xmm2 + + jmp .L_CALC_AAD_done_1 +.L_AAD_blocks_9_1: + subq $1024,%r12 + kmovq (%r12),%k1 + vmovdqu8 0(%r10),%zmm11 + vmovdqu8 64(%r10),%zmm3 + vmovdqu8 128(%r10),%xmm4{%k1}{z} + vpshufb %zmm16,%zmm11,%zmm11 + vpshufb %zmm16,%zmm3,%zmm3 + vpshufb %xmm16,%xmm4,%xmm4 + vpxorq %zmm2,%zmm11,%zmm11 + vmovdqu64 208(%rsi),%zmm15 + vpclmulqdq $0x11,%zmm15,%zmm11,%zmm1 + vpclmulqdq $0x00,%zmm15,%zmm11,%zmm6 + vpclmulqdq $0x01,%zmm15,%zmm11,%zmm7 + vpclmulqdq $0x10,%zmm15,%zmm11,%zmm8 + vmovdqu64 272(%rsi),%zmm15 + vpclmulqdq $0x11,%zmm15,%zmm3,%zmm9 + vpclmulqdq $0x00,%zmm15,%zmm3,%zmm10 + vpclmulqdq $0x01,%zmm15,%zmm3,%zmm12 + vpclmulqdq $0x10,%zmm15,%zmm3,%zmm13 + vpxorq %zmm9,%zmm1,%zmm9 + vpxorq %zmm10,%zmm6,%zmm10 + vpxorq %zmm12,%zmm7,%zmm12 + vpxorq %zmm13,%zmm8,%zmm13 + vmovdqu64 336(%rsi),%xmm15 + vpclmulqdq $0x01,%xmm15,%xmm4,%xmm7 + vpclmulqdq $0x10,%xmm15,%xmm4,%xmm8 + vpclmulqdq $0x11,%xmm15,%xmm4,%xmm1 + vpclmulqdq $0x00,%xmm15,%xmm4,%xmm6 + + vpxorq %zmm12,%zmm7,%zmm7 + vpxorq %zmm13,%zmm8,%zmm8 + vpxorq %zmm9,%zmm1,%zmm1 + vpxorq %zmm10,%zmm6,%zmm6 + + vpxorq %zmm8,%zmm7,%zmm7 + vpsrldq $8,%zmm7,%zmm12 + vpslldq $8,%zmm7,%zmm13 + vpxorq %zmm12,%zmm1,%zmm1 + vpxorq %zmm13,%zmm6,%zmm6 + vextracti64x4 $1,%zmm1,%ymm12 + vpxorq %ymm12,%ymm1,%ymm1 + vextracti32x4 $1,%ymm1,%xmm12 + vpxorq %xmm12,%xmm1,%xmm1 + vextracti64x4 $1,%zmm6,%ymm13 + vpxorq %ymm13,%ymm6,%ymm6 + vextracti32x4 
$1,%ymm6,%xmm13 + vpxorq %xmm13,%xmm6,%xmm6 + vmovdqa64 POLY2(%rip),%xmm15 + + + vpclmulqdq $0x01,%xmm6,%xmm15,%xmm7 + vpslldq $8,%xmm7,%xmm7 + vpxorq %xmm7,%xmm6,%xmm7 + + + vpclmulqdq $0x00,%xmm7,%xmm15,%xmm8 + vpsrldq $4,%xmm8,%xmm8 + vpclmulqdq $0x10,%xmm7,%xmm15,%xmm2 + vpslldq $4,%xmm2,%xmm2 + vpternlogq $0x96,%xmm1,%xmm8,%xmm2 + + jmp .L_CALC_AAD_done_1 +.L_AAD_blocks_8_1: + subq $512,%r12 + kmovq (%r12),%k1 + vmovdqu8 0(%r10),%zmm11 + vmovdqu8 64(%r10),%zmm3{%k1}{z} + vpshufb %zmm16,%zmm11,%zmm11 + vpshufb %zmm16,%zmm3,%zmm3 + vpxorq %zmm2,%zmm11,%zmm11 + vmovdqu64 224(%rsi),%zmm15 + vpclmulqdq $0x11,%zmm15,%zmm11,%zmm1 + vpclmulqdq $0x00,%zmm15,%zmm11,%zmm6 + vpclmulqdq $0x01,%zmm15,%zmm11,%zmm7 + vpclmulqdq $0x10,%zmm15,%zmm11,%zmm8 + vmovdqu64 288(%rsi),%zmm15 + vpclmulqdq $0x11,%zmm15,%zmm3,%zmm9 + vpclmulqdq $0x00,%zmm15,%zmm3,%zmm10 + vpclmulqdq $0x01,%zmm15,%zmm3,%zmm12 + vpclmulqdq $0x10,%zmm15,%zmm3,%zmm13 + vpxorq %zmm9,%zmm1,%zmm9 + vpxorq %zmm10,%zmm6,%zmm10 + vpxorq %zmm12,%zmm7,%zmm12 + vpxorq %zmm13,%zmm8,%zmm13 + + vpxorq %zmm13,%zmm12,%zmm12 + vpsrldq $8,%zmm12,%zmm7 + vpslldq $8,%zmm12,%zmm8 + vpxorq %zmm7,%zmm9,%zmm1 + vpxorq %zmm8,%zmm10,%zmm6 + vextracti64x4 $1,%zmm1,%ymm12 + vpxorq %ymm12,%ymm1,%ymm1 + vextracti32x4 $1,%ymm1,%xmm12 + vpxorq %xmm12,%xmm1,%xmm1 + vextracti64x4 $1,%zmm6,%ymm13 + vpxorq %ymm13,%ymm6,%ymm6 + vextracti32x4 $1,%ymm6,%xmm13 + vpxorq %xmm13,%xmm6,%xmm6 + vmovdqa64 POLY2(%rip),%xmm15 + + + vpclmulqdq $0x01,%xmm6,%xmm15,%xmm7 + vpslldq $8,%xmm7,%xmm7 + vpxorq %xmm7,%xmm6,%xmm7 + + + vpclmulqdq $0x00,%xmm7,%xmm15,%xmm8 + vpsrldq $4,%xmm8,%xmm8 + vpclmulqdq $0x10,%xmm7,%xmm15,%xmm2 + vpslldq $4,%xmm2,%xmm2 + vpternlogq $0x96,%xmm1,%xmm8,%xmm2 + + jmp .L_CALC_AAD_done_1 +.L_AAD_blocks_7_1: + subq $512,%r12 + kmovq (%r12),%k1 + vmovdqu8 0(%r10),%zmm11 + vmovdqu8 64(%r10),%zmm3{%k1}{z} + vpshufb %zmm16,%zmm11,%zmm11 + vpshufb %zmm16,%zmm3,%zmm3 + vpxorq %zmm2,%zmm11,%zmm11 + vmovdqu64 240(%rsi),%zmm15 + vpclmulqdq $0x11,%zmm15,%zmm11,%zmm9 + vpclmulqdq $0x00,%zmm15,%zmm11,%zmm10 + vpclmulqdq $0x01,%zmm15,%zmm11,%zmm12 + vpclmulqdq $0x10,%zmm15,%zmm11,%zmm13 + vmovdqu64 304(%rsi),%ymm15 + vinserti64x2 $2,336(%rsi),%zmm15,%zmm15 + vpclmulqdq $0x01,%zmm15,%zmm3,%zmm7 + vpclmulqdq $0x10,%zmm15,%zmm3,%zmm8 + vpclmulqdq $0x11,%zmm15,%zmm3,%zmm1 + vpclmulqdq $0x00,%zmm15,%zmm3,%zmm6 + + vpxorq %zmm12,%zmm7,%zmm7 + vpxorq %zmm13,%zmm8,%zmm8 + vpxorq %zmm9,%zmm1,%zmm1 + vpxorq %zmm10,%zmm6,%zmm6 + + vpxorq %zmm8,%zmm7,%zmm7 + vpsrldq $8,%zmm7,%zmm12 + vpslldq $8,%zmm7,%zmm13 + vpxorq %zmm12,%zmm1,%zmm1 + vpxorq %zmm13,%zmm6,%zmm6 + vextracti64x4 $1,%zmm1,%ymm12 + vpxorq %ymm12,%ymm1,%ymm1 + vextracti32x4 $1,%ymm1,%xmm12 + vpxorq %xmm12,%xmm1,%xmm1 + vextracti64x4 $1,%zmm6,%ymm13 + vpxorq %ymm13,%ymm6,%ymm6 + vextracti32x4 $1,%ymm6,%xmm13 + vpxorq %xmm13,%xmm6,%xmm6 + vmovdqa64 POLY2(%rip),%xmm15 + + + vpclmulqdq $0x01,%xmm6,%xmm15,%xmm7 + vpslldq $8,%xmm7,%xmm7 + vpxorq %xmm7,%xmm6,%xmm7 + + + vpclmulqdq $0x00,%xmm7,%xmm15,%xmm8 + vpsrldq $4,%xmm8,%xmm8 + vpclmulqdq $0x10,%xmm7,%xmm15,%xmm2 + vpslldq $4,%xmm2,%xmm2 + vpternlogq $0x96,%xmm1,%xmm8,%xmm2 + + jmp .L_CALC_AAD_done_1 +.L_AAD_blocks_6_1: + subq $512,%r12 + kmovq (%r12),%k1 + vmovdqu8 0(%r10),%zmm11 + vmovdqu8 64(%r10),%ymm3{%k1}{z} + vpshufb %zmm16,%zmm11,%zmm11 + vpshufb %ymm16,%ymm3,%ymm3 + vpxorq %zmm2,%zmm11,%zmm11 + vmovdqu64 256(%rsi),%zmm15 + vpclmulqdq $0x11,%zmm15,%zmm11,%zmm9 + vpclmulqdq $0x00,%zmm15,%zmm11,%zmm10 + vpclmulqdq $0x01,%zmm15,%zmm11,%zmm12 + vpclmulqdq 
$0x10,%zmm15,%zmm11,%zmm13 + vmovdqu64 320(%rsi),%ymm15 + vpclmulqdq $0x01,%ymm15,%ymm3,%ymm7 + vpclmulqdq $0x10,%ymm15,%ymm3,%ymm8 + vpclmulqdq $0x11,%ymm15,%ymm3,%ymm1 + vpclmulqdq $0x00,%ymm15,%ymm3,%ymm6 + + vpxorq %zmm12,%zmm7,%zmm7 + vpxorq %zmm13,%zmm8,%zmm8 + vpxorq %zmm9,%zmm1,%zmm1 + vpxorq %zmm10,%zmm6,%zmm6 + + vpxorq %zmm8,%zmm7,%zmm7 + vpsrldq $8,%zmm7,%zmm12 + vpslldq $8,%zmm7,%zmm13 + vpxorq %zmm12,%zmm1,%zmm1 + vpxorq %zmm13,%zmm6,%zmm6 + vextracti64x4 $1,%zmm1,%ymm12 + vpxorq %ymm12,%ymm1,%ymm1 + vextracti32x4 $1,%ymm1,%xmm12 + vpxorq %xmm12,%xmm1,%xmm1 + vextracti64x4 $1,%zmm6,%ymm13 + vpxorq %ymm13,%ymm6,%ymm6 + vextracti32x4 $1,%ymm6,%xmm13 + vpxorq %xmm13,%xmm6,%xmm6 + vmovdqa64 POLY2(%rip),%xmm15 + + + vpclmulqdq $0x01,%xmm6,%xmm15,%xmm7 + vpslldq $8,%xmm7,%xmm7 + vpxorq %xmm7,%xmm6,%xmm7 + + + vpclmulqdq $0x00,%xmm7,%xmm15,%xmm8 + vpsrldq $4,%xmm8,%xmm8 + vpclmulqdq $0x10,%xmm7,%xmm15,%xmm2 + vpslldq $4,%xmm2,%xmm2 + vpternlogq $0x96,%xmm1,%xmm8,%xmm2 + + jmp .L_CALC_AAD_done_1 +.L_AAD_blocks_5_1: + subq $512,%r12 + kmovq (%r12),%k1 + vmovdqu8 0(%r10),%zmm11 + vmovdqu8 64(%r10),%xmm3{%k1}{z} + vpshufb %zmm16,%zmm11,%zmm11 + vpshufb %xmm16,%xmm3,%xmm3 + vpxorq %zmm2,%zmm11,%zmm11 + vmovdqu64 272(%rsi),%zmm15 + vpclmulqdq $0x11,%zmm15,%zmm11,%zmm9 + vpclmulqdq $0x00,%zmm15,%zmm11,%zmm10 + vpclmulqdq $0x01,%zmm15,%zmm11,%zmm12 + vpclmulqdq $0x10,%zmm15,%zmm11,%zmm13 + vmovdqu64 336(%rsi),%xmm15 + vpclmulqdq $0x01,%xmm15,%xmm3,%xmm7 + vpclmulqdq $0x10,%xmm15,%xmm3,%xmm8 + vpclmulqdq $0x11,%xmm15,%xmm3,%xmm1 + vpclmulqdq $0x00,%xmm15,%xmm3,%xmm6 + + vpxorq %zmm12,%zmm7,%zmm7 + vpxorq %zmm13,%zmm8,%zmm8 + vpxorq %zmm9,%zmm1,%zmm1 + vpxorq %zmm10,%zmm6,%zmm6 + + vpxorq %zmm8,%zmm7,%zmm7 + vpsrldq $8,%zmm7,%zmm12 + vpslldq $8,%zmm7,%zmm13 + vpxorq %zmm12,%zmm1,%zmm1 + vpxorq %zmm13,%zmm6,%zmm6 + vextracti64x4 $1,%zmm1,%ymm12 + vpxorq %ymm12,%ymm1,%ymm1 + vextracti32x4 $1,%ymm1,%xmm12 + vpxorq %xmm12,%xmm1,%xmm1 + vextracti64x4 $1,%zmm6,%ymm13 + vpxorq %ymm13,%ymm6,%ymm6 + vextracti32x4 $1,%ymm6,%xmm13 + vpxorq %xmm13,%xmm6,%xmm6 + vmovdqa64 POLY2(%rip),%xmm15 + + + vpclmulqdq $0x01,%xmm6,%xmm15,%xmm7 + vpslldq $8,%xmm7,%xmm7 + vpxorq %xmm7,%xmm6,%xmm7 + + + vpclmulqdq $0x00,%xmm7,%xmm15,%xmm8 + vpsrldq $4,%xmm8,%xmm8 + vpclmulqdq $0x10,%xmm7,%xmm15,%xmm2 + vpslldq $4,%xmm2,%xmm2 + vpternlogq $0x96,%xmm1,%xmm8,%xmm2 + + jmp .L_CALC_AAD_done_1 +.L_AAD_blocks_4_1: + kmovq (%r12),%k1 + vmovdqu8 0(%r10),%zmm11{%k1}{z} + vpshufb %zmm16,%zmm11,%zmm11 + vpxorq %zmm2,%zmm11,%zmm11 + vmovdqu64 288(%rsi),%zmm15 + vpclmulqdq $0x11,%zmm15,%zmm11,%zmm9 + vpclmulqdq $0x00,%zmm15,%zmm11,%zmm10 + vpclmulqdq $0x01,%zmm15,%zmm11,%zmm12 + vpclmulqdq $0x10,%zmm15,%zmm11,%zmm13 + + vpxorq %zmm13,%zmm12,%zmm12 + vpsrldq $8,%zmm12,%zmm7 + vpslldq $8,%zmm12,%zmm8 + vpxorq %zmm7,%zmm9,%zmm1 + vpxorq %zmm8,%zmm10,%zmm6 + vextracti64x4 $1,%zmm1,%ymm12 + vpxorq %ymm12,%ymm1,%ymm1 + vextracti32x4 $1,%ymm1,%xmm12 + vpxorq %xmm12,%xmm1,%xmm1 + vextracti64x4 $1,%zmm6,%ymm13 + vpxorq %ymm13,%ymm6,%ymm6 + vextracti32x4 $1,%ymm6,%xmm13 + vpxorq %xmm13,%xmm6,%xmm6 + vmovdqa64 POLY2(%rip),%xmm15 + + + vpclmulqdq $0x01,%xmm6,%xmm15,%xmm7 + vpslldq $8,%xmm7,%xmm7 + vpxorq %xmm7,%xmm6,%xmm7 + + + vpclmulqdq $0x00,%xmm7,%xmm15,%xmm8 + vpsrldq $4,%xmm8,%xmm8 + vpclmulqdq $0x10,%xmm7,%xmm15,%xmm2 + vpslldq $4,%xmm2,%xmm2 + vpternlogq $0x96,%xmm1,%xmm8,%xmm2 + + jmp .L_CALC_AAD_done_1 +.L_AAD_blocks_3_1: + kmovq (%r12),%k1 + vmovdqu8 0(%r10),%zmm11{%k1}{z} + vpshufb %zmm16,%zmm11,%zmm11 + vpxorq %zmm2,%zmm11,%zmm11 + 
vmovdqu64 304(%rsi),%ymm15 + vinserti64x2 $2,336(%rsi),%zmm15,%zmm15 + vpclmulqdq $0x01,%zmm15,%zmm11,%zmm7 + vpclmulqdq $0x10,%zmm15,%zmm11,%zmm8 + vpclmulqdq $0x11,%zmm15,%zmm11,%zmm1 + vpclmulqdq $0x00,%zmm15,%zmm11,%zmm6 + + vpxorq %zmm8,%zmm7,%zmm7 + vpsrldq $8,%zmm7,%zmm12 + vpslldq $8,%zmm7,%zmm13 + vpxorq %zmm12,%zmm1,%zmm1 + vpxorq %zmm13,%zmm6,%zmm6 + vextracti64x4 $1,%zmm1,%ymm12 + vpxorq %ymm12,%ymm1,%ymm1 + vextracti32x4 $1,%ymm1,%xmm12 + vpxorq %xmm12,%xmm1,%xmm1 + vextracti64x4 $1,%zmm6,%ymm13 + vpxorq %ymm13,%ymm6,%ymm6 + vextracti32x4 $1,%ymm6,%xmm13 + vpxorq %xmm13,%xmm6,%xmm6 + vmovdqa64 POLY2(%rip),%xmm15 + + + vpclmulqdq $0x01,%xmm6,%xmm15,%xmm7 + vpslldq $8,%xmm7,%xmm7 + vpxorq %xmm7,%xmm6,%xmm7 + + + vpclmulqdq $0x00,%xmm7,%xmm15,%xmm8 + vpsrldq $4,%xmm8,%xmm8 + vpclmulqdq $0x10,%xmm7,%xmm15,%xmm2 + vpslldq $4,%xmm2,%xmm2 + vpternlogq $0x96,%xmm1,%xmm8,%xmm2 + + jmp .L_CALC_AAD_done_1 +.L_AAD_blocks_2_1: + kmovq (%r12),%k1 + vmovdqu8 0(%r10),%ymm11{%k1}{z} + vpshufb %ymm16,%ymm11,%ymm11 + vpxorq %zmm2,%zmm11,%zmm11 + vmovdqu64 320(%rsi),%ymm15 + vpclmulqdq $0x01,%ymm15,%ymm11,%ymm7 + vpclmulqdq $0x10,%ymm15,%ymm11,%ymm8 + vpclmulqdq $0x11,%ymm15,%ymm11,%ymm1 + vpclmulqdq $0x00,%ymm15,%ymm11,%ymm6 + + vpxorq %zmm8,%zmm7,%zmm7 + vpsrldq $8,%zmm7,%zmm12 + vpslldq $8,%zmm7,%zmm13 + vpxorq %zmm12,%zmm1,%zmm1 + vpxorq %zmm13,%zmm6,%zmm6 + vextracti64x4 $1,%zmm1,%ymm12 + vpxorq %ymm12,%ymm1,%ymm1 + vextracti32x4 $1,%ymm1,%xmm12 + vpxorq %xmm12,%xmm1,%xmm1 + vextracti64x4 $1,%zmm6,%ymm13 + vpxorq %ymm13,%ymm6,%ymm6 + vextracti32x4 $1,%ymm6,%xmm13 + vpxorq %xmm13,%xmm6,%xmm6 + vmovdqa64 POLY2(%rip),%xmm15 + + + vpclmulqdq $0x01,%xmm6,%xmm15,%xmm7 + vpslldq $8,%xmm7,%xmm7 + vpxorq %xmm7,%xmm6,%xmm7 + + + vpclmulqdq $0x00,%xmm7,%xmm15,%xmm8 + vpsrldq $4,%xmm8,%xmm8 + vpclmulqdq $0x10,%xmm7,%xmm15,%xmm2 + vpslldq $4,%xmm2,%xmm2 + vpternlogq $0x96,%xmm1,%xmm8,%xmm2 + + jmp .L_CALC_AAD_done_1 +.L_AAD_blocks_1_1: + kmovq (%r12),%k1 + vmovdqu8 0(%r10),%xmm11{%k1}{z} + vpshufb %xmm16,%xmm11,%xmm11 + vpxorq %zmm2,%zmm11,%zmm11 + vmovdqu64 336(%rsi),%xmm15 + vpclmulqdq $0x01,%xmm15,%xmm11,%xmm7 + vpclmulqdq $0x10,%xmm15,%xmm11,%xmm8 + vpclmulqdq $0x11,%xmm15,%xmm11,%xmm1 + vpclmulqdq $0x00,%xmm15,%xmm11,%xmm6 + + vpxorq %zmm8,%zmm7,%zmm7 + vpsrldq $8,%zmm7,%zmm12 + vpslldq $8,%zmm7,%zmm13 + vpxorq %zmm12,%zmm1,%zmm1 + vpxorq %zmm13,%zmm6,%zmm6 + vextracti64x4 $1,%zmm1,%ymm12 + vpxorq %ymm12,%ymm1,%ymm1 + vextracti32x4 $1,%ymm1,%xmm12 + vpxorq %xmm12,%xmm1,%xmm1 + vextracti64x4 $1,%zmm6,%ymm13 + vpxorq %ymm13,%ymm6,%ymm6 + vextracti32x4 $1,%ymm6,%xmm13 + vpxorq %xmm13,%xmm6,%xmm6 + vmovdqa64 POLY2(%rip),%xmm15 + + + vpclmulqdq $0x01,%xmm6,%xmm15,%xmm7 + vpslldq $8,%xmm7,%xmm7 + vpxorq %xmm7,%xmm6,%xmm7 + + + vpclmulqdq $0x00,%xmm7,%xmm15,%xmm8 + vpsrldq $4,%xmm8,%xmm8 + vpclmulqdq $0x10,%xmm7,%xmm15,%xmm2 + vpslldq $4,%xmm2,%xmm2 + vpternlogq $0x96,%xmm1,%xmm8,%xmm2 + +.L_CALC_AAD_done_1: + movq %rcx,%r10 + shlq $3,%r10 + vmovq %r10,%xmm3 + + + vpxorq %xmm2,%xmm3,%xmm2 + + vmovdqu64 336(%rsi),%xmm1 + + vpclmulqdq $0x11,%xmm1,%xmm2,%xmm11 + vpclmulqdq $0x00,%xmm1,%xmm2,%xmm3 + vpclmulqdq $0x01,%xmm1,%xmm2,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm2,%xmm2 + vpxorq %xmm4,%xmm2,%xmm2 + + vpsrldq $8,%xmm2,%xmm4 + vpslldq $8,%xmm2,%xmm2 + vpxorq %xmm4,%xmm11,%xmm11 + vpxorq %xmm3,%xmm2,%xmm2 + + + + vmovdqu64 POLY2(%rip),%xmm4 + + vpclmulqdq $0x01,%xmm2,%xmm4,%xmm3 + vpslldq $8,%xmm3,%xmm3 + vpxorq %xmm3,%xmm2,%xmm2 + + + + vpclmulqdq $0x00,%xmm2,%xmm4,%xmm3 + vpsrldq $4,%xmm3,%xmm3 + vpclmulqdq 
$0x10,%xmm2,%xmm4,%xmm2 + vpslldq $4,%xmm2,%xmm2 + + vpternlogq $0x96,%xmm3,%xmm11,%xmm2 + + vpshufb SHUF_MASK(%rip),%xmm2,%xmm2 + jmp skip_iv_len_12_init_IV +iv_len_12_init_IV: + + vmovdqu8 ONEf(%rip),%xmm2 + movq %rdx,%r11 + movl $0x0000000000000fff,%r10d + kmovq %r10,%k1 + vmovdqu8 (%r11),%xmm2{%k1} +skip_iv_len_12_init_IV: + vmovdqu %xmm2,%xmm1 + + + movl 240(%rdi),%r10d + cmpl $9,%r10d + je .Laes_128_4 + cmpl $11,%r10d + je .Laes_192_4 + cmpl $13,%r10d + je .Laes_256_4 + jmp .Lexit_aes_4 +.align 32 +.Laes_128_4: + vpxorq 0(%rdi),%xmm1,%xmm1 + + vaesenc 16(%rdi),%xmm1,%xmm1 + + vaesenc 32(%rdi),%xmm1,%xmm1 + + vaesenc 48(%rdi),%xmm1,%xmm1 + + vaesenc 64(%rdi),%xmm1,%xmm1 + + vaesenc 80(%rdi),%xmm1,%xmm1 + + vaesenc 96(%rdi),%xmm1,%xmm1 + + vaesenc 112(%rdi),%xmm1,%xmm1 + + vaesenc 128(%rdi),%xmm1,%xmm1 + + vaesenc 144(%rdi),%xmm1,%xmm1 + + vaesenclast 160(%rdi),%xmm1,%xmm1 + jmp .Lexit_aes_4 +.align 32 +.Laes_192_4: + vpxorq 0(%rdi),%xmm1,%xmm1 + + vaesenc 16(%rdi),%xmm1,%xmm1 + + vaesenc 32(%rdi),%xmm1,%xmm1 + + vaesenc 48(%rdi),%xmm1,%xmm1 + + vaesenc 64(%rdi),%xmm1,%xmm1 + + vaesenc 80(%rdi),%xmm1,%xmm1 + + vaesenc 96(%rdi),%xmm1,%xmm1 + + vaesenc 112(%rdi),%xmm1,%xmm1 + + vaesenc 128(%rdi),%xmm1,%xmm1 + + vaesenc 144(%rdi),%xmm1,%xmm1 + + vaesenc 160(%rdi),%xmm1,%xmm1 + + vaesenc 176(%rdi),%xmm1,%xmm1 + + vaesenclast 192(%rdi),%xmm1,%xmm1 + jmp .Lexit_aes_4 +.align 32 +.Laes_256_4: + vpxorq 0(%rdi),%xmm1,%xmm1 + + vaesenc 16(%rdi),%xmm1,%xmm1 + + vaesenc 32(%rdi),%xmm1,%xmm1 + + vaesenc 48(%rdi),%xmm1,%xmm1 + + vaesenc 64(%rdi),%xmm1,%xmm1 + + vaesenc 80(%rdi),%xmm1,%xmm1 + + vaesenc 96(%rdi),%xmm1,%xmm1 + + vaesenc 112(%rdi),%xmm1,%xmm1 + + vaesenc 128(%rdi),%xmm1,%xmm1 + + vaesenc 144(%rdi),%xmm1,%xmm1 + + vaesenc 160(%rdi),%xmm1,%xmm1 + + vaesenc 176(%rdi),%xmm1,%xmm1 + + vaesenc 192(%rdi),%xmm1,%xmm1 + + vaesenc 208(%rdi),%xmm1,%xmm1 + + vaesenclast 224(%rdi),%xmm1,%xmm1 + jmp .Lexit_aes_4 +.Lexit_aes_4: + + vmovdqu %xmm1,32(%rsi) + + + vpshufb SHUF_MASK(%rip),%xmm2,%xmm2 + vmovdqu %xmm2,0(%rsi) + cmpq $256,%rcx + jbe .Lskip_hkeys_cleanup_5 + vpxor %xmm0,%xmm0,%xmm0 + vmovdqa64 %zmm0,0(%rsp) + vmovdqa64 %zmm0,64(%rsp) + vmovdqa64 %zmm0,128(%rsp) + vmovdqa64 %zmm0,192(%rsp) + vmovdqa64 %zmm0,256(%rsp) + vmovdqa64 %zmm0,320(%rsp) + vmovdqa64 %zmm0,384(%rsp) + vmovdqa64 %zmm0,448(%rsp) + vmovdqa64 %zmm0,512(%rsp) + vmovdqa64 %zmm0,576(%rsp) + vmovdqa64 %zmm0,640(%rsp) + vmovdqa64 %zmm0,704(%rsp) +.Lskip_hkeys_cleanup_5: + vzeroupper + leaq (%rbp),%rsp +.cfi_def_cfa_register %rsp + popq %r15 +.cfi_adjust_cfa_offset -8 +.cfi_restore %r15 + popq %r14 +.cfi_adjust_cfa_offset -8 +.cfi_restore %r14 + popq %r13 +.cfi_adjust_cfa_offset -8 +.cfi_restore %r13 + popq %r12 +.cfi_adjust_cfa_offset -8 +.cfi_restore %r12 + popq %rbp +.cfi_adjust_cfa_offset -8 +.cfi_restore %rbp + popq %rbx +.cfi_adjust_cfa_offset -8 +.cfi_restore %rbx +.Labort_setiv: + .byte 0xf3,0xc3 +.Lsetiv_seh_end: +.cfi_endproc +.size ossl_aes_gcm_setiv_avx512, .-ossl_aes_gcm_setiv_avx512 +.globl ossl_aes_gcm_update_aad_avx512 +.type ossl_aes_gcm_update_aad_avx512,@function +.align 32 +ossl_aes_gcm_update_aad_avx512: +.cfi_startproc +.Lghash_seh_begin: +.byte 243,15,30,250 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-16 +.Lghash_seh_push_rbx: + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-24 +.Lghash_seh_push_rbp: + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 +.Lghash_seh_push_r12: + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 +.Lghash_seh_push_r13: + pushq %r14 
+.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 +.Lghash_seh_push_r14: + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 +.Lghash_seh_push_r15: + + + + + + + + + + + leaq 0(%rsp),%rbp +.cfi_def_cfa_register %rbp +.Lghash_seh_setfp: + +.Lghash_seh_prolog_end: + subq $820,%rsp + andq $(-64),%rsp + vmovdqu64 64(%rdi),%xmm14 + movq %rsi,%r10 + movq %rdx,%r11 + orq %r11,%r11 + jz .L_CALC_AAD_done_6 + + xorq %rbx,%rbx + vmovdqa64 SHUF_MASK(%rip),%zmm16 + +.L_get_AAD_loop48x16_6: + cmpq $768,%r11 + jl .L_exit_AAD_loop48x16_6 + vmovdqu64 0(%r10),%zmm11 + vmovdqu64 64(%r10),%zmm3 + vmovdqu64 128(%r10),%zmm4 + vmovdqu64 192(%r10),%zmm5 + vpshufb %zmm16,%zmm11,%zmm11 + vpshufb %zmm16,%zmm3,%zmm3 + vpshufb %zmm16,%zmm4,%zmm4 + vpshufb %zmm16,%zmm5,%zmm5 + testq %rbx,%rbx + jnz .L_skip_hkeys_precomputation_7 + + vmovdqu64 288(%rdi),%zmm1 + vmovdqu64 %zmm1,704(%rsp) + + vmovdqu64 224(%rdi),%zmm9 + vmovdqu64 %zmm9,640(%rsp) + + + vshufi64x2 $0x00,%zmm9,%zmm9,%zmm9 + + vmovdqu64 160(%rdi),%zmm10 + vmovdqu64 %zmm10,576(%rsp) + + vmovdqu64 96(%rdi),%zmm12 + vmovdqu64 %zmm12,512(%rsp) + + vpclmulqdq $0x11,%zmm9,%zmm10,%zmm13 + vpclmulqdq $0x00,%zmm9,%zmm10,%zmm15 + vpclmulqdq $0x01,%zmm9,%zmm10,%zmm17 + vpclmulqdq $0x10,%zmm9,%zmm10,%zmm10 + vpxorq %zmm17,%zmm10,%zmm10 + + vpsrldq $8,%zmm10,%zmm17 + vpslldq $8,%zmm10,%zmm10 + vpxorq %zmm17,%zmm13,%zmm13 + vpxorq %zmm15,%zmm10,%zmm10 + + + + vmovdqu64 POLY2(%rip),%zmm17 + + vpclmulqdq $0x01,%zmm10,%zmm17,%zmm15 + vpslldq $8,%zmm15,%zmm15 + vpxorq %zmm15,%zmm10,%zmm10 + + + + vpclmulqdq $0x00,%zmm10,%zmm17,%zmm15 + vpsrldq $4,%zmm15,%zmm15 + vpclmulqdq $0x10,%zmm10,%zmm17,%zmm10 + vpslldq $4,%zmm10,%zmm10 + + vpternlogq $0x96,%zmm15,%zmm13,%zmm10 + + vmovdqu64 %zmm10,448(%rsp) + + vpclmulqdq $0x11,%zmm9,%zmm12,%zmm13 + vpclmulqdq $0x00,%zmm9,%zmm12,%zmm15 + vpclmulqdq $0x01,%zmm9,%zmm12,%zmm17 + vpclmulqdq $0x10,%zmm9,%zmm12,%zmm12 + vpxorq %zmm17,%zmm12,%zmm12 + + vpsrldq $8,%zmm12,%zmm17 + vpslldq $8,%zmm12,%zmm12 + vpxorq %zmm17,%zmm13,%zmm13 + vpxorq %zmm15,%zmm12,%zmm12 + + + + vmovdqu64 POLY2(%rip),%zmm17 + + vpclmulqdq $0x01,%zmm12,%zmm17,%zmm15 + vpslldq $8,%zmm15,%zmm15 + vpxorq %zmm15,%zmm12,%zmm12 + + + + vpclmulqdq $0x00,%zmm12,%zmm17,%zmm15 + vpsrldq $4,%zmm15,%zmm15 + vpclmulqdq $0x10,%zmm12,%zmm17,%zmm12 + vpslldq $4,%zmm12,%zmm12 + + vpternlogq $0x96,%zmm15,%zmm13,%zmm12 + + vmovdqu64 %zmm12,384(%rsp) + + vpclmulqdq $0x11,%zmm9,%zmm10,%zmm13 + vpclmulqdq $0x00,%zmm9,%zmm10,%zmm15 + vpclmulqdq $0x01,%zmm9,%zmm10,%zmm17 + vpclmulqdq $0x10,%zmm9,%zmm10,%zmm10 + vpxorq %zmm17,%zmm10,%zmm10 + + vpsrldq $8,%zmm10,%zmm17 + vpslldq $8,%zmm10,%zmm10 + vpxorq %zmm17,%zmm13,%zmm13 + vpxorq %zmm15,%zmm10,%zmm10 + + + + vmovdqu64 POLY2(%rip),%zmm17 + + vpclmulqdq $0x01,%zmm10,%zmm17,%zmm15 + vpslldq $8,%zmm15,%zmm15 + vpxorq %zmm15,%zmm10,%zmm10 + + + + vpclmulqdq $0x00,%zmm10,%zmm17,%zmm15 + vpsrldq $4,%zmm15,%zmm15 + vpclmulqdq $0x10,%zmm10,%zmm17,%zmm10 + vpslldq $4,%zmm10,%zmm10 + + vpternlogq $0x96,%zmm15,%zmm13,%zmm10 + + vmovdqu64 %zmm10,320(%rsp) + + vpclmulqdq $0x11,%zmm9,%zmm12,%zmm13 + vpclmulqdq $0x00,%zmm9,%zmm12,%zmm15 + vpclmulqdq $0x01,%zmm9,%zmm12,%zmm17 + vpclmulqdq $0x10,%zmm9,%zmm12,%zmm12 + vpxorq %zmm17,%zmm12,%zmm12 + + vpsrldq $8,%zmm12,%zmm17 + vpslldq $8,%zmm12,%zmm12 + vpxorq %zmm17,%zmm13,%zmm13 + vpxorq %zmm15,%zmm12,%zmm12 + + + + vmovdqu64 POLY2(%rip),%zmm17 + + vpclmulqdq $0x01,%zmm12,%zmm17,%zmm15 + vpslldq $8,%zmm15,%zmm15 + vpxorq %zmm15,%zmm12,%zmm12 + + + + vpclmulqdq $0x00,%zmm12,%zmm17,%zmm15 + vpsrldq 
$4,%zmm15,%zmm15 + vpclmulqdq $0x10,%zmm12,%zmm17,%zmm12 + vpslldq $4,%zmm12,%zmm12 + + vpternlogq $0x96,%zmm15,%zmm13,%zmm12 + + vmovdqu64 %zmm12,256(%rsp) + + vpclmulqdq $0x11,%zmm9,%zmm10,%zmm13 + vpclmulqdq $0x00,%zmm9,%zmm10,%zmm15 + vpclmulqdq $0x01,%zmm9,%zmm10,%zmm17 + vpclmulqdq $0x10,%zmm9,%zmm10,%zmm10 + vpxorq %zmm17,%zmm10,%zmm10 + + vpsrldq $8,%zmm10,%zmm17 + vpslldq $8,%zmm10,%zmm10 + vpxorq %zmm17,%zmm13,%zmm13 + vpxorq %zmm15,%zmm10,%zmm10 + + + + vmovdqu64 POLY2(%rip),%zmm17 + + vpclmulqdq $0x01,%zmm10,%zmm17,%zmm15 + vpslldq $8,%zmm15,%zmm15 + vpxorq %zmm15,%zmm10,%zmm10 + + + + vpclmulqdq $0x00,%zmm10,%zmm17,%zmm15 + vpsrldq $4,%zmm15,%zmm15 + vpclmulqdq $0x10,%zmm10,%zmm17,%zmm10 + vpslldq $4,%zmm10,%zmm10 + + vpternlogq $0x96,%zmm15,%zmm13,%zmm10 + + vmovdqu64 %zmm10,192(%rsp) + + vpclmulqdq $0x11,%zmm9,%zmm12,%zmm13 + vpclmulqdq $0x00,%zmm9,%zmm12,%zmm15 + vpclmulqdq $0x01,%zmm9,%zmm12,%zmm17 + vpclmulqdq $0x10,%zmm9,%zmm12,%zmm12 + vpxorq %zmm17,%zmm12,%zmm12 + + vpsrldq $8,%zmm12,%zmm17 + vpslldq $8,%zmm12,%zmm12 + vpxorq %zmm17,%zmm13,%zmm13 + vpxorq %zmm15,%zmm12,%zmm12 + + + + vmovdqu64 POLY2(%rip),%zmm17 + + vpclmulqdq $0x01,%zmm12,%zmm17,%zmm15 + vpslldq $8,%zmm15,%zmm15 + vpxorq %zmm15,%zmm12,%zmm12 + + + + vpclmulqdq $0x00,%zmm12,%zmm17,%zmm15 + vpsrldq $4,%zmm15,%zmm15 + vpclmulqdq $0x10,%zmm12,%zmm17,%zmm12 + vpslldq $4,%zmm12,%zmm12 + + vpternlogq $0x96,%zmm15,%zmm13,%zmm12 + + vmovdqu64 %zmm12,128(%rsp) + + vpclmulqdq $0x11,%zmm9,%zmm10,%zmm13 + vpclmulqdq $0x00,%zmm9,%zmm10,%zmm15 + vpclmulqdq $0x01,%zmm9,%zmm10,%zmm17 + vpclmulqdq $0x10,%zmm9,%zmm10,%zmm10 + vpxorq %zmm17,%zmm10,%zmm10 + + vpsrldq $8,%zmm10,%zmm17 + vpslldq $8,%zmm10,%zmm10 + vpxorq %zmm17,%zmm13,%zmm13 + vpxorq %zmm15,%zmm10,%zmm10 + + + + vmovdqu64 POLY2(%rip),%zmm17 + + vpclmulqdq $0x01,%zmm10,%zmm17,%zmm15 + vpslldq $8,%zmm15,%zmm15 + vpxorq %zmm15,%zmm10,%zmm10 + + + + vpclmulqdq $0x00,%zmm10,%zmm17,%zmm15 + vpsrldq $4,%zmm15,%zmm15 + vpclmulqdq $0x10,%zmm10,%zmm17,%zmm10 + vpslldq $4,%zmm10,%zmm10 + + vpternlogq $0x96,%zmm15,%zmm13,%zmm10 + + vmovdqu64 %zmm10,64(%rsp) + + vpclmulqdq $0x11,%zmm9,%zmm12,%zmm13 + vpclmulqdq $0x00,%zmm9,%zmm12,%zmm15 + vpclmulqdq $0x01,%zmm9,%zmm12,%zmm17 + vpclmulqdq $0x10,%zmm9,%zmm12,%zmm12 + vpxorq %zmm17,%zmm12,%zmm12 + + vpsrldq $8,%zmm12,%zmm17 + vpslldq $8,%zmm12,%zmm12 + vpxorq %zmm17,%zmm13,%zmm13 + vpxorq %zmm15,%zmm12,%zmm12 + + + + vmovdqu64 POLY2(%rip),%zmm17 + + vpclmulqdq $0x01,%zmm12,%zmm17,%zmm15 + vpslldq $8,%zmm15,%zmm15 + vpxorq %zmm15,%zmm12,%zmm12 + + + + vpclmulqdq $0x00,%zmm12,%zmm17,%zmm15 + vpsrldq $4,%zmm15,%zmm15 + vpclmulqdq $0x10,%zmm12,%zmm17,%zmm12 + vpslldq $4,%zmm12,%zmm12 + + vpternlogq $0x96,%zmm15,%zmm13,%zmm12 + + vmovdqu64 %zmm12,0(%rsp) +.L_skip_hkeys_precomputation_7: + movq $1,%rbx + vpxorq %zmm14,%zmm11,%zmm11 + vmovdqu64 0(%rsp),%zmm19 + vpclmulqdq $0x11,%zmm19,%zmm11,%zmm1 + vpclmulqdq $0x00,%zmm19,%zmm11,%zmm9 + vpclmulqdq $0x01,%zmm19,%zmm11,%zmm10 + vpclmulqdq $0x10,%zmm19,%zmm11,%zmm12 + vmovdqu64 64(%rsp),%zmm19 + vpclmulqdq $0x11,%zmm19,%zmm3,%zmm13 + vpclmulqdq $0x00,%zmm19,%zmm3,%zmm15 + vpclmulqdq $0x01,%zmm19,%zmm3,%zmm17 + vpclmulqdq $0x10,%zmm19,%zmm3,%zmm18 + vpxorq %zmm17,%zmm10,%zmm7 + vpxorq %zmm13,%zmm1,%zmm6 + vpxorq %zmm15,%zmm9,%zmm8 + vpternlogq $0x96,%zmm18,%zmm12,%zmm7 + vmovdqu64 128(%rsp),%zmm19 + vpclmulqdq $0x11,%zmm19,%zmm4,%zmm1 + vpclmulqdq $0x00,%zmm19,%zmm4,%zmm9 + vpclmulqdq $0x01,%zmm19,%zmm4,%zmm10 + vpclmulqdq $0x10,%zmm19,%zmm4,%zmm12 + vmovdqu64 192(%rsp),%zmm19 + 
vpclmulqdq $0x11,%zmm19,%zmm5,%zmm13 + vpclmulqdq $0x00,%zmm19,%zmm5,%zmm15 + vpclmulqdq $0x01,%zmm19,%zmm5,%zmm17 + vpclmulqdq $0x10,%zmm19,%zmm5,%zmm18 + + vpternlogq $0x96,%zmm17,%zmm10,%zmm7 + vpternlogq $0x96,%zmm13,%zmm1,%zmm6 + vpternlogq $0x96,%zmm15,%zmm9,%zmm8 + vpternlogq $0x96,%zmm18,%zmm12,%zmm7 + vmovdqu64 256(%r10),%zmm11 + vmovdqu64 320(%r10),%zmm3 + vmovdqu64 384(%r10),%zmm4 + vmovdqu64 448(%r10),%zmm5 + vpshufb %zmm16,%zmm11,%zmm11 + vpshufb %zmm16,%zmm3,%zmm3 + vpshufb %zmm16,%zmm4,%zmm4 + vpshufb %zmm16,%zmm5,%zmm5 + vmovdqu64 256(%rsp),%zmm19 + vpclmulqdq $0x11,%zmm19,%zmm11,%zmm1 + vpclmulqdq $0x00,%zmm19,%zmm11,%zmm9 + vpclmulqdq $0x01,%zmm19,%zmm11,%zmm10 + vpclmulqdq $0x10,%zmm19,%zmm11,%zmm12 + vmovdqu64 320(%rsp),%zmm19 + vpclmulqdq $0x11,%zmm19,%zmm3,%zmm13 + vpclmulqdq $0x00,%zmm19,%zmm3,%zmm15 + vpclmulqdq $0x01,%zmm19,%zmm3,%zmm17 + vpclmulqdq $0x10,%zmm19,%zmm3,%zmm18 + vpternlogq $0x96,%zmm17,%zmm10,%zmm7 + vpternlogq $0x96,%zmm13,%zmm1,%zmm6 + vpternlogq $0x96,%zmm15,%zmm9,%zmm8 + vpternlogq $0x96,%zmm18,%zmm12,%zmm7 + vmovdqu64 384(%rsp),%zmm19 + vpclmulqdq $0x11,%zmm19,%zmm4,%zmm1 + vpclmulqdq $0x00,%zmm19,%zmm4,%zmm9 + vpclmulqdq $0x01,%zmm19,%zmm4,%zmm10 + vpclmulqdq $0x10,%zmm19,%zmm4,%zmm12 + vmovdqu64 448(%rsp),%zmm19 + vpclmulqdq $0x11,%zmm19,%zmm5,%zmm13 + vpclmulqdq $0x00,%zmm19,%zmm5,%zmm15 + vpclmulqdq $0x01,%zmm19,%zmm5,%zmm17 + vpclmulqdq $0x10,%zmm19,%zmm5,%zmm18 + + vpternlogq $0x96,%zmm17,%zmm10,%zmm7 + vpternlogq $0x96,%zmm13,%zmm1,%zmm6 + vpternlogq $0x96,%zmm15,%zmm9,%zmm8 + vpternlogq $0x96,%zmm18,%zmm12,%zmm7 + vmovdqu64 512(%r10),%zmm11 + vmovdqu64 576(%r10),%zmm3 + vmovdqu64 640(%r10),%zmm4 + vmovdqu64 704(%r10),%zmm5 + vpshufb %zmm16,%zmm11,%zmm11 + vpshufb %zmm16,%zmm3,%zmm3 + vpshufb %zmm16,%zmm4,%zmm4 + vpshufb %zmm16,%zmm5,%zmm5 + vmovdqu64 512(%rsp),%zmm19 + vpclmulqdq $0x11,%zmm19,%zmm11,%zmm1 + vpclmulqdq $0x00,%zmm19,%zmm11,%zmm9 + vpclmulqdq $0x01,%zmm19,%zmm11,%zmm10 + vpclmulqdq $0x10,%zmm19,%zmm11,%zmm12 + vmovdqu64 576(%rsp),%zmm19 + vpclmulqdq $0x11,%zmm19,%zmm3,%zmm13 + vpclmulqdq $0x00,%zmm19,%zmm3,%zmm15 + vpclmulqdq $0x01,%zmm19,%zmm3,%zmm17 + vpclmulqdq $0x10,%zmm19,%zmm3,%zmm18 + vpternlogq $0x96,%zmm17,%zmm10,%zmm7 + vpternlogq $0x96,%zmm13,%zmm1,%zmm6 + vpternlogq $0x96,%zmm15,%zmm9,%zmm8 + vpternlogq $0x96,%zmm18,%zmm12,%zmm7 + vmovdqu64 640(%rsp),%zmm19 + vpclmulqdq $0x11,%zmm19,%zmm4,%zmm1 + vpclmulqdq $0x00,%zmm19,%zmm4,%zmm9 + vpclmulqdq $0x01,%zmm19,%zmm4,%zmm10 + vpclmulqdq $0x10,%zmm19,%zmm4,%zmm12 + vmovdqu64 704(%rsp),%zmm19 + vpclmulqdq $0x11,%zmm19,%zmm5,%zmm13 + vpclmulqdq $0x00,%zmm19,%zmm5,%zmm15 + vpclmulqdq $0x01,%zmm19,%zmm5,%zmm17 + vpclmulqdq $0x10,%zmm19,%zmm5,%zmm18 + + vpternlogq $0x96,%zmm17,%zmm10,%zmm7 + vpternlogq $0x96,%zmm13,%zmm1,%zmm6 + vpternlogq $0x96,%zmm15,%zmm9,%zmm8 + vpternlogq $0x96,%zmm18,%zmm12,%zmm7 + + vpsrldq $8,%zmm7,%zmm1 + vpslldq $8,%zmm7,%zmm9 + vpxorq %zmm1,%zmm6,%zmm6 + vpxorq %zmm9,%zmm8,%zmm8 + vextracti64x4 $1,%zmm6,%ymm1 + vpxorq %ymm1,%ymm6,%ymm6 + vextracti32x4 $1,%ymm6,%xmm1 + vpxorq %xmm1,%xmm6,%xmm6 + vextracti64x4 $1,%zmm8,%ymm9 + vpxorq %ymm9,%ymm8,%ymm8 + vextracti32x4 $1,%ymm8,%xmm9 + vpxorq %xmm9,%xmm8,%xmm8 + vmovdqa64 POLY2(%rip),%xmm10 + + + vpclmulqdq $0x01,%xmm8,%xmm10,%xmm1 + vpslldq $8,%xmm1,%xmm1 + vpxorq %xmm1,%xmm8,%xmm1 + + + vpclmulqdq $0x00,%xmm1,%xmm10,%xmm9 + vpsrldq $4,%xmm9,%xmm9 + vpclmulqdq $0x10,%xmm1,%xmm10,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm6,%xmm9,%xmm14 + + subq $768,%r11 + je .L_CALC_AAD_done_6 + + 
addq $768,%r10 + jmp .L_get_AAD_loop48x16_6 + +.L_exit_AAD_loop48x16_6: + + cmpq $512,%r11 + jl .L_less_than_32x16_6 + + vmovdqu64 0(%r10),%zmm11 + vmovdqu64 64(%r10),%zmm3 + vmovdqu64 128(%r10),%zmm4 + vmovdqu64 192(%r10),%zmm5 + vpshufb %zmm16,%zmm11,%zmm11 + vpshufb %zmm16,%zmm3,%zmm3 + vpshufb %zmm16,%zmm4,%zmm4 + vpshufb %zmm16,%zmm5,%zmm5 + testq %rbx,%rbx + jnz .L_skip_hkeys_precomputation_8 + + vmovdqu64 288(%rdi),%zmm1 + vmovdqu64 %zmm1,704(%rsp) + + vmovdqu64 224(%rdi),%zmm9 + vmovdqu64 %zmm9,640(%rsp) + + + vshufi64x2 $0x00,%zmm9,%zmm9,%zmm9 + + vmovdqu64 160(%rdi),%zmm10 + vmovdqu64 %zmm10,576(%rsp) + + vmovdqu64 96(%rdi),%zmm12 + vmovdqu64 %zmm12,512(%rsp) + + vpclmulqdq $0x11,%zmm9,%zmm10,%zmm13 + vpclmulqdq $0x00,%zmm9,%zmm10,%zmm15 + vpclmulqdq $0x01,%zmm9,%zmm10,%zmm17 + vpclmulqdq $0x10,%zmm9,%zmm10,%zmm10 + vpxorq %zmm17,%zmm10,%zmm10 + + vpsrldq $8,%zmm10,%zmm17 + vpslldq $8,%zmm10,%zmm10 + vpxorq %zmm17,%zmm13,%zmm13 + vpxorq %zmm15,%zmm10,%zmm10 + + + + vmovdqu64 POLY2(%rip),%zmm17 + + vpclmulqdq $0x01,%zmm10,%zmm17,%zmm15 + vpslldq $8,%zmm15,%zmm15 + vpxorq %zmm15,%zmm10,%zmm10 + + + + vpclmulqdq $0x00,%zmm10,%zmm17,%zmm15 + vpsrldq $4,%zmm15,%zmm15 + vpclmulqdq $0x10,%zmm10,%zmm17,%zmm10 + vpslldq $4,%zmm10,%zmm10 + + vpternlogq $0x96,%zmm15,%zmm13,%zmm10 + + vmovdqu64 %zmm10,448(%rsp) + + vpclmulqdq $0x11,%zmm9,%zmm12,%zmm13 + vpclmulqdq $0x00,%zmm9,%zmm12,%zmm15 + vpclmulqdq $0x01,%zmm9,%zmm12,%zmm17 + vpclmulqdq $0x10,%zmm9,%zmm12,%zmm12 + vpxorq %zmm17,%zmm12,%zmm12 + + vpsrldq $8,%zmm12,%zmm17 + vpslldq $8,%zmm12,%zmm12 + vpxorq %zmm17,%zmm13,%zmm13 + vpxorq %zmm15,%zmm12,%zmm12 + + + + vmovdqu64 POLY2(%rip),%zmm17 + + vpclmulqdq $0x01,%zmm12,%zmm17,%zmm15 + vpslldq $8,%zmm15,%zmm15 + vpxorq %zmm15,%zmm12,%zmm12 + + + + vpclmulqdq $0x00,%zmm12,%zmm17,%zmm15 + vpsrldq $4,%zmm15,%zmm15 + vpclmulqdq $0x10,%zmm12,%zmm17,%zmm12 + vpslldq $4,%zmm12,%zmm12 + + vpternlogq $0x96,%zmm15,%zmm13,%zmm12 + + vmovdqu64 %zmm12,384(%rsp) + + vpclmulqdq $0x11,%zmm9,%zmm10,%zmm13 + vpclmulqdq $0x00,%zmm9,%zmm10,%zmm15 + vpclmulqdq $0x01,%zmm9,%zmm10,%zmm17 + vpclmulqdq $0x10,%zmm9,%zmm10,%zmm10 + vpxorq %zmm17,%zmm10,%zmm10 + + vpsrldq $8,%zmm10,%zmm17 + vpslldq $8,%zmm10,%zmm10 + vpxorq %zmm17,%zmm13,%zmm13 + vpxorq %zmm15,%zmm10,%zmm10 + + + + vmovdqu64 POLY2(%rip),%zmm17 + + vpclmulqdq $0x01,%zmm10,%zmm17,%zmm15 + vpslldq $8,%zmm15,%zmm15 + vpxorq %zmm15,%zmm10,%zmm10 + + + + vpclmulqdq $0x00,%zmm10,%zmm17,%zmm15 + vpsrldq $4,%zmm15,%zmm15 + vpclmulqdq $0x10,%zmm10,%zmm17,%zmm10 + vpslldq $4,%zmm10,%zmm10 + + vpternlogq $0x96,%zmm15,%zmm13,%zmm10 + + vmovdqu64 %zmm10,320(%rsp) + + vpclmulqdq $0x11,%zmm9,%zmm12,%zmm13 + vpclmulqdq $0x00,%zmm9,%zmm12,%zmm15 + vpclmulqdq $0x01,%zmm9,%zmm12,%zmm17 + vpclmulqdq $0x10,%zmm9,%zmm12,%zmm12 + vpxorq %zmm17,%zmm12,%zmm12 + + vpsrldq $8,%zmm12,%zmm17 + vpslldq $8,%zmm12,%zmm12 + vpxorq %zmm17,%zmm13,%zmm13 + vpxorq %zmm15,%zmm12,%zmm12 + + + + vmovdqu64 POLY2(%rip),%zmm17 + + vpclmulqdq $0x01,%zmm12,%zmm17,%zmm15 + vpslldq $8,%zmm15,%zmm15 + vpxorq %zmm15,%zmm12,%zmm12 + + + + vpclmulqdq $0x00,%zmm12,%zmm17,%zmm15 + vpsrldq $4,%zmm15,%zmm15 + vpclmulqdq $0x10,%zmm12,%zmm17,%zmm12 + vpslldq $4,%zmm12,%zmm12 + + vpternlogq $0x96,%zmm15,%zmm13,%zmm12 + + vmovdqu64 %zmm12,256(%rsp) +.L_skip_hkeys_precomputation_8: + movq $1,%rbx + vpxorq %zmm14,%zmm11,%zmm11 + vmovdqu64 256(%rsp),%zmm19 + vpclmulqdq $0x11,%zmm19,%zmm11,%zmm1 + vpclmulqdq $0x00,%zmm19,%zmm11,%zmm9 + vpclmulqdq $0x01,%zmm19,%zmm11,%zmm10 + vpclmulqdq $0x10,%zmm19,%zmm11,%zmm12 
+ vmovdqu64 320(%rsp),%zmm19 + vpclmulqdq $0x11,%zmm19,%zmm3,%zmm13 + vpclmulqdq $0x00,%zmm19,%zmm3,%zmm15 + vpclmulqdq $0x01,%zmm19,%zmm3,%zmm17 + vpclmulqdq $0x10,%zmm19,%zmm3,%zmm18 + vpxorq %zmm17,%zmm10,%zmm7 + vpxorq %zmm13,%zmm1,%zmm6 + vpxorq %zmm15,%zmm9,%zmm8 + vpternlogq $0x96,%zmm18,%zmm12,%zmm7 + vmovdqu64 384(%rsp),%zmm19 + vpclmulqdq $0x11,%zmm19,%zmm4,%zmm1 + vpclmulqdq $0x00,%zmm19,%zmm4,%zmm9 + vpclmulqdq $0x01,%zmm19,%zmm4,%zmm10 + vpclmulqdq $0x10,%zmm19,%zmm4,%zmm12 + vmovdqu64 448(%rsp),%zmm19 + vpclmulqdq $0x11,%zmm19,%zmm5,%zmm13 + vpclmulqdq $0x00,%zmm19,%zmm5,%zmm15 + vpclmulqdq $0x01,%zmm19,%zmm5,%zmm17 + vpclmulqdq $0x10,%zmm19,%zmm5,%zmm18 + + vpternlogq $0x96,%zmm17,%zmm10,%zmm7 + vpternlogq $0x96,%zmm13,%zmm1,%zmm6 + vpternlogq $0x96,%zmm15,%zmm9,%zmm8 + vpternlogq $0x96,%zmm18,%zmm12,%zmm7 + vmovdqu64 256(%r10),%zmm11 + vmovdqu64 320(%r10),%zmm3 + vmovdqu64 384(%r10),%zmm4 + vmovdqu64 448(%r10),%zmm5 + vpshufb %zmm16,%zmm11,%zmm11 + vpshufb %zmm16,%zmm3,%zmm3 + vpshufb %zmm16,%zmm4,%zmm4 + vpshufb %zmm16,%zmm5,%zmm5 + vmovdqu64 512(%rsp),%zmm19 + vpclmulqdq $0x11,%zmm19,%zmm11,%zmm1 + vpclmulqdq $0x00,%zmm19,%zmm11,%zmm9 + vpclmulqdq $0x01,%zmm19,%zmm11,%zmm10 + vpclmulqdq $0x10,%zmm19,%zmm11,%zmm12 + vmovdqu64 576(%rsp),%zmm19 + vpclmulqdq $0x11,%zmm19,%zmm3,%zmm13 + vpclmulqdq $0x00,%zmm19,%zmm3,%zmm15 + vpclmulqdq $0x01,%zmm19,%zmm3,%zmm17 + vpclmulqdq $0x10,%zmm19,%zmm3,%zmm18 + vpternlogq $0x96,%zmm17,%zmm10,%zmm7 + vpternlogq $0x96,%zmm13,%zmm1,%zmm6 + vpternlogq $0x96,%zmm15,%zmm9,%zmm8 + vpternlogq $0x96,%zmm18,%zmm12,%zmm7 + vmovdqu64 640(%rsp),%zmm19 + vpclmulqdq $0x11,%zmm19,%zmm4,%zmm1 + vpclmulqdq $0x00,%zmm19,%zmm4,%zmm9 + vpclmulqdq $0x01,%zmm19,%zmm4,%zmm10 + vpclmulqdq $0x10,%zmm19,%zmm4,%zmm12 + vmovdqu64 704(%rsp),%zmm19 + vpclmulqdq $0x11,%zmm19,%zmm5,%zmm13 + vpclmulqdq $0x00,%zmm19,%zmm5,%zmm15 + vpclmulqdq $0x01,%zmm19,%zmm5,%zmm17 + vpclmulqdq $0x10,%zmm19,%zmm5,%zmm18 + + vpternlogq $0x96,%zmm17,%zmm10,%zmm7 + vpternlogq $0x96,%zmm13,%zmm1,%zmm6 + vpternlogq $0x96,%zmm15,%zmm9,%zmm8 + vpternlogq $0x96,%zmm18,%zmm12,%zmm7 + + vpsrldq $8,%zmm7,%zmm1 + vpslldq $8,%zmm7,%zmm9 + vpxorq %zmm1,%zmm6,%zmm6 + vpxorq %zmm9,%zmm8,%zmm8 + vextracti64x4 $1,%zmm6,%ymm1 + vpxorq %ymm1,%ymm6,%ymm6 + vextracti32x4 $1,%ymm6,%xmm1 + vpxorq %xmm1,%xmm6,%xmm6 + vextracti64x4 $1,%zmm8,%ymm9 + vpxorq %ymm9,%ymm8,%ymm8 + vextracti32x4 $1,%ymm8,%xmm9 + vpxorq %xmm9,%xmm8,%xmm8 + vmovdqa64 POLY2(%rip),%xmm10 + + + vpclmulqdq $0x01,%xmm8,%xmm10,%xmm1 + vpslldq $8,%xmm1,%xmm1 + vpxorq %xmm1,%xmm8,%xmm1 + + + vpclmulqdq $0x00,%xmm1,%xmm10,%xmm9 + vpsrldq $4,%xmm9,%xmm9 + vpclmulqdq $0x10,%xmm1,%xmm10,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm6,%xmm9,%xmm14 + + subq $512,%r11 + je .L_CALC_AAD_done_6 + + addq $512,%r10 + jmp .L_less_than_16x16_6 + +.L_less_than_32x16_6: + cmpq $256,%r11 + jl .L_less_than_16x16_6 + + vmovdqu64 0(%r10),%zmm11 + vmovdqu64 64(%r10),%zmm3 + vmovdqu64 128(%r10),%zmm4 + vmovdqu64 192(%r10),%zmm5 + vpshufb %zmm16,%zmm11,%zmm11 + vpshufb %zmm16,%zmm3,%zmm3 + vpshufb %zmm16,%zmm4,%zmm4 + vpshufb %zmm16,%zmm5,%zmm5 + vpxorq %zmm14,%zmm11,%zmm11 + vmovdqu64 96(%rdi),%zmm19 + vpclmulqdq $0x11,%zmm19,%zmm11,%zmm1 + vpclmulqdq $0x00,%zmm19,%zmm11,%zmm9 + vpclmulqdq $0x01,%zmm19,%zmm11,%zmm10 + vpclmulqdq $0x10,%zmm19,%zmm11,%zmm12 + vmovdqu64 160(%rdi),%zmm19 + vpclmulqdq $0x11,%zmm19,%zmm3,%zmm13 + vpclmulqdq $0x00,%zmm19,%zmm3,%zmm15 + vpclmulqdq $0x01,%zmm19,%zmm3,%zmm17 + vpclmulqdq $0x10,%zmm19,%zmm3,%zmm18 + vpxorq 
%zmm17,%zmm10,%zmm7 + vpxorq %zmm13,%zmm1,%zmm6 + vpxorq %zmm15,%zmm9,%zmm8 + vpternlogq $0x96,%zmm18,%zmm12,%zmm7 + vmovdqu64 224(%rdi),%zmm19 + vpclmulqdq $0x11,%zmm19,%zmm4,%zmm1 + vpclmulqdq $0x00,%zmm19,%zmm4,%zmm9 + vpclmulqdq $0x01,%zmm19,%zmm4,%zmm10 + vpclmulqdq $0x10,%zmm19,%zmm4,%zmm12 + vmovdqu64 288(%rdi),%zmm19 + vpclmulqdq $0x11,%zmm19,%zmm5,%zmm13 + vpclmulqdq $0x00,%zmm19,%zmm5,%zmm15 + vpclmulqdq $0x01,%zmm19,%zmm5,%zmm17 + vpclmulqdq $0x10,%zmm19,%zmm5,%zmm18 + + vpternlogq $0x96,%zmm17,%zmm10,%zmm7 + vpternlogq $0x96,%zmm13,%zmm1,%zmm6 + vpternlogq $0x96,%zmm15,%zmm9,%zmm8 + vpternlogq $0x96,%zmm18,%zmm12,%zmm7 + + vpsrldq $8,%zmm7,%zmm1 + vpslldq $8,%zmm7,%zmm9 + vpxorq %zmm1,%zmm6,%zmm6 + vpxorq %zmm9,%zmm8,%zmm8 + vextracti64x4 $1,%zmm6,%ymm1 + vpxorq %ymm1,%ymm6,%ymm6 + vextracti32x4 $1,%ymm6,%xmm1 + vpxorq %xmm1,%xmm6,%xmm6 + vextracti64x4 $1,%zmm8,%ymm9 + vpxorq %ymm9,%ymm8,%ymm8 + vextracti32x4 $1,%ymm8,%xmm9 + vpxorq %xmm9,%xmm8,%xmm8 + vmovdqa64 POLY2(%rip),%xmm10 + + + vpclmulqdq $0x01,%xmm8,%xmm10,%xmm1 + vpslldq $8,%xmm1,%xmm1 + vpxorq %xmm1,%xmm8,%xmm1 + + + vpclmulqdq $0x00,%xmm1,%xmm10,%xmm9 + vpsrldq $4,%xmm9,%xmm9 + vpclmulqdq $0x10,%xmm1,%xmm10,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm6,%xmm9,%xmm14 + + subq $256,%r11 + je .L_CALC_AAD_done_6 + + addq $256,%r10 + +.L_less_than_16x16_6: + + leaq byte64_len_to_mask_table(%rip),%r12 + leaq (%r12,%r11,8),%r12 + + + addl $15,%r11d + shrl $4,%r11d + cmpl $2,%r11d + jb .L_AAD_blocks_1_6 + je .L_AAD_blocks_2_6 + cmpl $4,%r11d + jb .L_AAD_blocks_3_6 + je .L_AAD_blocks_4_6 + cmpl $6,%r11d + jb .L_AAD_blocks_5_6 + je .L_AAD_blocks_6_6 + cmpl $8,%r11d + jb .L_AAD_blocks_7_6 + je .L_AAD_blocks_8_6 + cmpl $10,%r11d + jb .L_AAD_blocks_9_6 + je .L_AAD_blocks_10_6 + cmpl $12,%r11d + jb .L_AAD_blocks_11_6 + je .L_AAD_blocks_12_6 + cmpl $14,%r11d + jb .L_AAD_blocks_13_6 + je .L_AAD_blocks_14_6 + cmpl $15,%r11d + je .L_AAD_blocks_15_6 +.L_AAD_blocks_16_6: + subq $1536,%r12 + kmovq (%r12),%k1 + vmovdqu8 0(%r10),%zmm11 + vmovdqu8 64(%r10),%zmm3 + vmovdqu8 128(%r10),%zmm4 + vmovdqu8 192(%r10),%zmm5{%k1}{z} + vpshufb %zmm16,%zmm11,%zmm11 + vpshufb %zmm16,%zmm3,%zmm3 + vpshufb %zmm16,%zmm4,%zmm4 + vpshufb %zmm16,%zmm5,%zmm5 + vpxorq %zmm14,%zmm11,%zmm11 + vmovdqu64 96(%rdi),%zmm15 + vpclmulqdq $0x11,%zmm15,%zmm11,%zmm1 + vpclmulqdq $0x00,%zmm15,%zmm11,%zmm6 + vpclmulqdq $0x01,%zmm15,%zmm11,%zmm7 + vpclmulqdq $0x10,%zmm15,%zmm11,%zmm8 + vmovdqu64 160(%rdi),%zmm15 + vpclmulqdq $0x11,%zmm15,%zmm3,%zmm9 + vpclmulqdq $0x00,%zmm15,%zmm3,%zmm10 + vpclmulqdq $0x01,%zmm15,%zmm3,%zmm12 + vpclmulqdq $0x10,%zmm15,%zmm3,%zmm13 + vmovdqu64 224(%rdi),%zmm15 + vpclmulqdq $0x11,%zmm15,%zmm4,%zmm11 + vpclmulqdq $0x00,%zmm15,%zmm4,%zmm3 + vpternlogq $0x96,%zmm9,%zmm11,%zmm1 + vpternlogq $0x96,%zmm10,%zmm3,%zmm6 + vpclmulqdq $0x01,%zmm15,%zmm4,%zmm11 + vpclmulqdq $0x10,%zmm15,%zmm4,%zmm3 + vpternlogq $0x96,%zmm12,%zmm11,%zmm7 + vpternlogq $0x96,%zmm13,%zmm3,%zmm8 + vmovdqu64 288(%rdi),%zmm15 + vpclmulqdq $0x11,%zmm15,%zmm5,%zmm9 + vpclmulqdq $0x00,%zmm15,%zmm5,%zmm10 + vpclmulqdq $0x01,%zmm15,%zmm5,%zmm12 + vpclmulqdq $0x10,%zmm15,%zmm5,%zmm13 + vpxorq %zmm9,%zmm1,%zmm9 + vpxorq %zmm10,%zmm6,%zmm10 + vpxorq %zmm12,%zmm7,%zmm12 + vpxorq %zmm13,%zmm8,%zmm13 + + vpxorq %zmm13,%zmm12,%zmm12 + vpsrldq $8,%zmm12,%zmm7 + vpslldq $8,%zmm12,%zmm8 + vpxorq %zmm7,%zmm9,%zmm1 + vpxorq %zmm8,%zmm10,%zmm6 + vextracti64x4 $1,%zmm1,%ymm12 + vpxorq %ymm12,%ymm1,%ymm1 + vextracti32x4 $1,%ymm1,%xmm12 + vpxorq %xmm12,%xmm1,%xmm1 + vextracti64x4 
$1,%zmm6,%ymm13 + vpxorq %ymm13,%ymm6,%ymm6 + vextracti32x4 $1,%ymm6,%xmm13 + vpxorq %xmm13,%xmm6,%xmm6 + vmovdqa64 POLY2(%rip),%xmm15 + + + vpclmulqdq $0x01,%xmm6,%xmm15,%xmm7 + vpslldq $8,%xmm7,%xmm7 + vpxorq %xmm7,%xmm6,%xmm7 + + + vpclmulqdq $0x00,%xmm7,%xmm15,%xmm8 + vpsrldq $4,%xmm8,%xmm8 + vpclmulqdq $0x10,%xmm7,%xmm15,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm1,%xmm8,%xmm14 + + jmp .L_CALC_AAD_done_6 +.L_AAD_blocks_15_6: + subq $1536,%r12 + kmovq (%r12),%k1 + vmovdqu8 0(%r10),%zmm11 + vmovdqu8 64(%r10),%zmm3 + vmovdqu8 128(%r10),%zmm4 + vmovdqu8 192(%r10),%zmm5{%k1}{z} + vpshufb %zmm16,%zmm11,%zmm11 + vpshufb %zmm16,%zmm3,%zmm3 + vpshufb %zmm16,%zmm4,%zmm4 + vpshufb %zmm16,%zmm5,%zmm5 + vpxorq %zmm14,%zmm11,%zmm11 + vmovdqu64 112(%rdi),%zmm15 + vpclmulqdq $0x11,%zmm15,%zmm11,%zmm1 + vpclmulqdq $0x00,%zmm15,%zmm11,%zmm6 + vpclmulqdq $0x01,%zmm15,%zmm11,%zmm7 + vpclmulqdq $0x10,%zmm15,%zmm11,%zmm8 + vmovdqu64 176(%rdi),%zmm15 + vpclmulqdq $0x11,%zmm15,%zmm3,%zmm9 + vpclmulqdq $0x00,%zmm15,%zmm3,%zmm10 + vpclmulqdq $0x01,%zmm15,%zmm3,%zmm12 + vpclmulqdq $0x10,%zmm15,%zmm3,%zmm13 + vmovdqu64 240(%rdi),%zmm15 + vpclmulqdq $0x11,%zmm15,%zmm4,%zmm11 + vpclmulqdq $0x00,%zmm15,%zmm4,%zmm3 + vpternlogq $0x96,%zmm1,%zmm11,%zmm9 + vpternlogq $0x96,%zmm6,%zmm3,%zmm10 + vpclmulqdq $0x01,%zmm15,%zmm4,%zmm11 + vpclmulqdq $0x10,%zmm15,%zmm4,%zmm3 + vpternlogq $0x96,%zmm7,%zmm11,%zmm12 + vpternlogq $0x96,%zmm8,%zmm3,%zmm13 + vmovdqu64 304(%rdi),%ymm15 + vinserti64x2 $2,336(%rdi),%zmm15,%zmm15 + vpclmulqdq $0x01,%zmm15,%zmm5,%zmm7 + vpclmulqdq $0x10,%zmm15,%zmm5,%zmm8 + vpclmulqdq $0x11,%zmm15,%zmm5,%zmm1 + vpclmulqdq $0x00,%zmm15,%zmm5,%zmm6 + + vpxorq %zmm12,%zmm7,%zmm7 + vpxorq %zmm13,%zmm8,%zmm8 + vpxorq %zmm9,%zmm1,%zmm1 + vpxorq %zmm10,%zmm6,%zmm6 + + vpxorq %zmm8,%zmm7,%zmm7 + vpsrldq $8,%zmm7,%zmm12 + vpslldq $8,%zmm7,%zmm13 + vpxorq %zmm12,%zmm1,%zmm1 + vpxorq %zmm13,%zmm6,%zmm6 + vextracti64x4 $1,%zmm1,%ymm12 + vpxorq %ymm12,%ymm1,%ymm1 + vextracti32x4 $1,%ymm1,%xmm12 + vpxorq %xmm12,%xmm1,%xmm1 + vextracti64x4 $1,%zmm6,%ymm13 + vpxorq %ymm13,%ymm6,%ymm6 + vextracti32x4 $1,%ymm6,%xmm13 + vpxorq %xmm13,%xmm6,%xmm6 + vmovdqa64 POLY2(%rip),%xmm15 + + + vpclmulqdq $0x01,%xmm6,%xmm15,%xmm7 + vpslldq $8,%xmm7,%xmm7 + vpxorq %xmm7,%xmm6,%xmm7 + + + vpclmulqdq $0x00,%xmm7,%xmm15,%xmm8 + vpsrldq $4,%xmm8,%xmm8 + vpclmulqdq $0x10,%xmm7,%xmm15,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm1,%xmm8,%xmm14 + + jmp .L_CALC_AAD_done_6 +.L_AAD_blocks_14_6: + subq $1536,%r12 + kmovq (%r12),%k1 + vmovdqu8 0(%r10),%zmm11 + vmovdqu8 64(%r10),%zmm3 + vmovdqu8 128(%r10),%zmm4 + vmovdqu8 192(%r10),%ymm5{%k1}{z} + vpshufb %zmm16,%zmm11,%zmm11 + vpshufb %zmm16,%zmm3,%zmm3 + vpshufb %zmm16,%zmm4,%zmm4 + vpshufb %ymm16,%ymm5,%ymm5 + vpxorq %zmm14,%zmm11,%zmm11 + vmovdqu64 128(%rdi),%zmm15 + vpclmulqdq $0x11,%zmm15,%zmm11,%zmm1 + vpclmulqdq $0x00,%zmm15,%zmm11,%zmm6 + vpclmulqdq $0x01,%zmm15,%zmm11,%zmm7 + vpclmulqdq $0x10,%zmm15,%zmm11,%zmm8 + vmovdqu64 192(%rdi),%zmm15 + vpclmulqdq $0x11,%zmm15,%zmm3,%zmm9 + vpclmulqdq $0x00,%zmm15,%zmm3,%zmm10 + vpclmulqdq $0x01,%zmm15,%zmm3,%zmm12 + vpclmulqdq $0x10,%zmm15,%zmm3,%zmm13 + vmovdqu64 256(%rdi),%zmm15 + vpclmulqdq $0x11,%zmm15,%zmm4,%zmm11 + vpclmulqdq $0x00,%zmm15,%zmm4,%zmm3 + vpternlogq $0x96,%zmm1,%zmm11,%zmm9 + vpternlogq $0x96,%zmm6,%zmm3,%zmm10 + vpclmulqdq $0x01,%zmm15,%zmm4,%zmm11 + vpclmulqdq $0x10,%zmm15,%zmm4,%zmm3 + vpternlogq $0x96,%zmm7,%zmm11,%zmm12 + vpternlogq $0x96,%zmm8,%zmm3,%zmm13 + vmovdqu64 320(%rdi),%ymm15 + vpclmulqdq 
$0x01,%ymm15,%ymm5,%ymm7 + vpclmulqdq $0x10,%ymm15,%ymm5,%ymm8 + vpclmulqdq $0x11,%ymm15,%ymm5,%ymm1 + vpclmulqdq $0x00,%ymm15,%ymm5,%ymm6 + + vpxorq %zmm12,%zmm7,%zmm7 + vpxorq %zmm13,%zmm8,%zmm8 + vpxorq %zmm9,%zmm1,%zmm1 + vpxorq %zmm10,%zmm6,%zmm6 + + vpxorq %zmm8,%zmm7,%zmm7 + vpsrldq $8,%zmm7,%zmm12 + vpslldq $8,%zmm7,%zmm13 + vpxorq %zmm12,%zmm1,%zmm1 + vpxorq %zmm13,%zmm6,%zmm6 + vextracti64x4 $1,%zmm1,%ymm12 + vpxorq %ymm12,%ymm1,%ymm1 + vextracti32x4 $1,%ymm1,%xmm12 + vpxorq %xmm12,%xmm1,%xmm1 + vextracti64x4 $1,%zmm6,%ymm13 + vpxorq %ymm13,%ymm6,%ymm6 + vextracti32x4 $1,%ymm6,%xmm13 + vpxorq %xmm13,%xmm6,%xmm6 + vmovdqa64 POLY2(%rip),%xmm15 + + + vpclmulqdq $0x01,%xmm6,%xmm15,%xmm7 + vpslldq $8,%xmm7,%xmm7 + vpxorq %xmm7,%xmm6,%xmm7 + + + vpclmulqdq $0x00,%xmm7,%xmm15,%xmm8 + vpsrldq $4,%xmm8,%xmm8 + vpclmulqdq $0x10,%xmm7,%xmm15,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm1,%xmm8,%xmm14 + + jmp .L_CALC_AAD_done_6 +.L_AAD_blocks_13_6: + subq $1536,%r12 + kmovq (%r12),%k1 + vmovdqu8 0(%r10),%zmm11 + vmovdqu8 64(%r10),%zmm3 + vmovdqu8 128(%r10),%zmm4 + vmovdqu8 192(%r10),%xmm5{%k1}{z} + vpshufb %zmm16,%zmm11,%zmm11 + vpshufb %zmm16,%zmm3,%zmm3 + vpshufb %zmm16,%zmm4,%zmm4 + vpshufb %xmm16,%xmm5,%xmm5 + vpxorq %zmm14,%zmm11,%zmm11 + vmovdqu64 144(%rdi),%zmm15 + vpclmulqdq $0x11,%zmm15,%zmm11,%zmm1 + vpclmulqdq $0x00,%zmm15,%zmm11,%zmm6 + vpclmulqdq $0x01,%zmm15,%zmm11,%zmm7 + vpclmulqdq $0x10,%zmm15,%zmm11,%zmm8 + vmovdqu64 208(%rdi),%zmm15 + vpclmulqdq $0x11,%zmm15,%zmm3,%zmm9 + vpclmulqdq $0x00,%zmm15,%zmm3,%zmm10 + vpclmulqdq $0x01,%zmm15,%zmm3,%zmm12 + vpclmulqdq $0x10,%zmm15,%zmm3,%zmm13 + vmovdqu64 272(%rdi),%zmm15 + vpclmulqdq $0x11,%zmm15,%zmm4,%zmm11 + vpclmulqdq $0x00,%zmm15,%zmm4,%zmm3 + vpternlogq $0x96,%zmm1,%zmm11,%zmm9 + vpternlogq $0x96,%zmm6,%zmm3,%zmm10 + vpclmulqdq $0x01,%zmm15,%zmm4,%zmm11 + vpclmulqdq $0x10,%zmm15,%zmm4,%zmm3 + vpternlogq $0x96,%zmm7,%zmm11,%zmm12 + vpternlogq $0x96,%zmm8,%zmm3,%zmm13 + vmovdqu64 336(%rdi),%xmm15 + vpclmulqdq $0x01,%xmm15,%xmm5,%xmm7 + vpclmulqdq $0x10,%xmm15,%xmm5,%xmm8 + vpclmulqdq $0x11,%xmm15,%xmm5,%xmm1 + vpclmulqdq $0x00,%xmm15,%xmm5,%xmm6 + + vpxorq %zmm12,%zmm7,%zmm7 + vpxorq %zmm13,%zmm8,%zmm8 + vpxorq %zmm9,%zmm1,%zmm1 + vpxorq %zmm10,%zmm6,%zmm6 + + vpxorq %zmm8,%zmm7,%zmm7 + vpsrldq $8,%zmm7,%zmm12 + vpslldq $8,%zmm7,%zmm13 + vpxorq %zmm12,%zmm1,%zmm1 + vpxorq %zmm13,%zmm6,%zmm6 + vextracti64x4 $1,%zmm1,%ymm12 + vpxorq %ymm12,%ymm1,%ymm1 + vextracti32x4 $1,%ymm1,%xmm12 + vpxorq %xmm12,%xmm1,%xmm1 + vextracti64x4 $1,%zmm6,%ymm13 + vpxorq %ymm13,%ymm6,%ymm6 + vextracti32x4 $1,%ymm6,%xmm13 + vpxorq %xmm13,%xmm6,%xmm6 + vmovdqa64 POLY2(%rip),%xmm15 + + + vpclmulqdq $0x01,%xmm6,%xmm15,%xmm7 + vpslldq $8,%xmm7,%xmm7 + vpxorq %xmm7,%xmm6,%xmm7 + + + vpclmulqdq $0x00,%xmm7,%xmm15,%xmm8 + vpsrldq $4,%xmm8,%xmm8 + vpclmulqdq $0x10,%xmm7,%xmm15,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm1,%xmm8,%xmm14 + + jmp .L_CALC_AAD_done_6 +.L_AAD_blocks_12_6: + subq $1024,%r12 + kmovq (%r12),%k1 + vmovdqu8 0(%r10),%zmm11 + vmovdqu8 64(%r10),%zmm3 + vmovdqu8 128(%r10),%zmm4{%k1}{z} + vpshufb %zmm16,%zmm11,%zmm11 + vpshufb %zmm16,%zmm3,%zmm3 + vpshufb %zmm16,%zmm4,%zmm4 + vpxorq %zmm14,%zmm11,%zmm11 + vmovdqu64 160(%rdi),%zmm15 + vpclmulqdq $0x11,%zmm15,%zmm11,%zmm1 + vpclmulqdq $0x00,%zmm15,%zmm11,%zmm6 + vpclmulqdq $0x01,%zmm15,%zmm11,%zmm7 + vpclmulqdq $0x10,%zmm15,%zmm11,%zmm8 + vmovdqu64 224(%rdi),%zmm15 + vpclmulqdq $0x11,%zmm15,%zmm3,%zmm9 + vpclmulqdq $0x00,%zmm15,%zmm3,%zmm10 + vpclmulqdq 
$0x01,%zmm15,%zmm3,%zmm12 + vpclmulqdq $0x10,%zmm15,%zmm3,%zmm13 + vmovdqu64 288(%rdi),%zmm15 + vpclmulqdq $0x11,%zmm15,%zmm4,%zmm11 + vpclmulqdq $0x00,%zmm15,%zmm4,%zmm3 + vpternlogq $0x96,%zmm1,%zmm11,%zmm9 + vpternlogq $0x96,%zmm6,%zmm3,%zmm10 + vpclmulqdq $0x01,%zmm15,%zmm4,%zmm11 + vpclmulqdq $0x10,%zmm15,%zmm4,%zmm3 + vpternlogq $0x96,%zmm7,%zmm11,%zmm12 + vpternlogq $0x96,%zmm8,%zmm3,%zmm13 + + vpxorq %zmm13,%zmm12,%zmm12 + vpsrldq $8,%zmm12,%zmm7 + vpslldq $8,%zmm12,%zmm8 + vpxorq %zmm7,%zmm9,%zmm1 + vpxorq %zmm8,%zmm10,%zmm6 + vextracti64x4 $1,%zmm1,%ymm12 + vpxorq %ymm12,%ymm1,%ymm1 + vextracti32x4 $1,%ymm1,%xmm12 + vpxorq %xmm12,%xmm1,%xmm1 + vextracti64x4 $1,%zmm6,%ymm13 + vpxorq %ymm13,%ymm6,%ymm6 + vextracti32x4 $1,%ymm6,%xmm13 + vpxorq %xmm13,%xmm6,%xmm6 + vmovdqa64 POLY2(%rip),%xmm15 + + + vpclmulqdq $0x01,%xmm6,%xmm15,%xmm7 + vpslldq $8,%xmm7,%xmm7 + vpxorq %xmm7,%xmm6,%xmm7 + + + vpclmulqdq $0x00,%xmm7,%xmm15,%xmm8 + vpsrldq $4,%xmm8,%xmm8 + vpclmulqdq $0x10,%xmm7,%xmm15,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm1,%xmm8,%xmm14 + + jmp .L_CALC_AAD_done_6 +.L_AAD_blocks_11_6: + subq $1024,%r12 + kmovq (%r12),%k1 + vmovdqu8 0(%r10),%zmm11 + vmovdqu8 64(%r10),%zmm3 + vmovdqu8 128(%r10),%zmm4{%k1}{z} + vpshufb %zmm16,%zmm11,%zmm11 + vpshufb %zmm16,%zmm3,%zmm3 + vpshufb %zmm16,%zmm4,%zmm4 + vpxorq %zmm14,%zmm11,%zmm11 + vmovdqu64 176(%rdi),%zmm15 + vpclmulqdq $0x11,%zmm15,%zmm11,%zmm1 + vpclmulqdq $0x00,%zmm15,%zmm11,%zmm6 + vpclmulqdq $0x01,%zmm15,%zmm11,%zmm7 + vpclmulqdq $0x10,%zmm15,%zmm11,%zmm8 + vmovdqu64 240(%rdi),%zmm15 + vpclmulqdq $0x11,%zmm15,%zmm3,%zmm9 + vpclmulqdq $0x00,%zmm15,%zmm3,%zmm10 + vpclmulqdq $0x01,%zmm15,%zmm3,%zmm12 + vpclmulqdq $0x10,%zmm15,%zmm3,%zmm13 + vpxorq %zmm9,%zmm1,%zmm9 + vpxorq %zmm10,%zmm6,%zmm10 + vpxorq %zmm12,%zmm7,%zmm12 + vpxorq %zmm13,%zmm8,%zmm13 + vmovdqu64 304(%rdi),%ymm15 + vinserti64x2 $2,336(%rdi),%zmm15,%zmm15 + vpclmulqdq $0x01,%zmm15,%zmm4,%zmm7 + vpclmulqdq $0x10,%zmm15,%zmm4,%zmm8 + vpclmulqdq $0x11,%zmm15,%zmm4,%zmm1 + vpclmulqdq $0x00,%zmm15,%zmm4,%zmm6 + + vpxorq %zmm12,%zmm7,%zmm7 + vpxorq %zmm13,%zmm8,%zmm8 + vpxorq %zmm9,%zmm1,%zmm1 + vpxorq %zmm10,%zmm6,%zmm6 + + vpxorq %zmm8,%zmm7,%zmm7 + vpsrldq $8,%zmm7,%zmm12 + vpslldq $8,%zmm7,%zmm13 + vpxorq %zmm12,%zmm1,%zmm1 + vpxorq %zmm13,%zmm6,%zmm6 + vextracti64x4 $1,%zmm1,%ymm12 + vpxorq %ymm12,%ymm1,%ymm1 + vextracti32x4 $1,%ymm1,%xmm12 + vpxorq %xmm12,%xmm1,%xmm1 + vextracti64x4 $1,%zmm6,%ymm13 + vpxorq %ymm13,%ymm6,%ymm6 + vextracti32x4 $1,%ymm6,%xmm13 + vpxorq %xmm13,%xmm6,%xmm6 + vmovdqa64 POLY2(%rip),%xmm15 + + + vpclmulqdq $0x01,%xmm6,%xmm15,%xmm7 + vpslldq $8,%xmm7,%xmm7 + vpxorq %xmm7,%xmm6,%xmm7 + + + vpclmulqdq $0x00,%xmm7,%xmm15,%xmm8 + vpsrldq $4,%xmm8,%xmm8 + vpclmulqdq $0x10,%xmm7,%xmm15,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm1,%xmm8,%xmm14 + + jmp .L_CALC_AAD_done_6 +.L_AAD_blocks_10_6: + subq $1024,%r12 + kmovq (%r12),%k1 + vmovdqu8 0(%r10),%zmm11 + vmovdqu8 64(%r10),%zmm3 + vmovdqu8 128(%r10),%ymm4{%k1}{z} + vpshufb %zmm16,%zmm11,%zmm11 + vpshufb %zmm16,%zmm3,%zmm3 + vpshufb %ymm16,%ymm4,%ymm4 + vpxorq %zmm14,%zmm11,%zmm11 + vmovdqu64 192(%rdi),%zmm15 + vpclmulqdq $0x11,%zmm15,%zmm11,%zmm1 + vpclmulqdq $0x00,%zmm15,%zmm11,%zmm6 + vpclmulqdq $0x01,%zmm15,%zmm11,%zmm7 + vpclmulqdq $0x10,%zmm15,%zmm11,%zmm8 + vmovdqu64 256(%rdi),%zmm15 + vpclmulqdq $0x11,%zmm15,%zmm3,%zmm9 + vpclmulqdq $0x00,%zmm15,%zmm3,%zmm10 + vpclmulqdq $0x01,%zmm15,%zmm3,%zmm12 + vpclmulqdq $0x10,%zmm15,%zmm3,%zmm13 + vpxorq %zmm9,%zmm1,%zmm9 + vpxorq 
%zmm10,%zmm6,%zmm10 + vpxorq %zmm12,%zmm7,%zmm12 + vpxorq %zmm13,%zmm8,%zmm13 + vmovdqu64 320(%rdi),%ymm15 + vpclmulqdq $0x01,%ymm15,%ymm4,%ymm7 + vpclmulqdq $0x10,%ymm15,%ymm4,%ymm8 + vpclmulqdq $0x11,%ymm15,%ymm4,%ymm1 + vpclmulqdq $0x00,%ymm15,%ymm4,%ymm6 + + vpxorq %zmm12,%zmm7,%zmm7 + vpxorq %zmm13,%zmm8,%zmm8 + vpxorq %zmm9,%zmm1,%zmm1 + vpxorq %zmm10,%zmm6,%zmm6 + + vpxorq %zmm8,%zmm7,%zmm7 + vpsrldq $8,%zmm7,%zmm12 + vpslldq $8,%zmm7,%zmm13 + vpxorq %zmm12,%zmm1,%zmm1 + vpxorq %zmm13,%zmm6,%zmm6 + vextracti64x4 $1,%zmm1,%ymm12 + vpxorq %ymm12,%ymm1,%ymm1 + vextracti32x4 $1,%ymm1,%xmm12 + vpxorq %xmm12,%xmm1,%xmm1 + vextracti64x4 $1,%zmm6,%ymm13 + vpxorq %ymm13,%ymm6,%ymm6 + vextracti32x4 $1,%ymm6,%xmm13 + vpxorq %xmm13,%xmm6,%xmm6 + vmovdqa64 POLY2(%rip),%xmm15 + + + vpclmulqdq $0x01,%xmm6,%xmm15,%xmm7 + vpslldq $8,%xmm7,%xmm7 + vpxorq %xmm7,%xmm6,%xmm7 + + + vpclmulqdq $0x00,%xmm7,%xmm15,%xmm8 + vpsrldq $4,%xmm8,%xmm8 + vpclmulqdq $0x10,%xmm7,%xmm15,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm1,%xmm8,%xmm14 + + jmp .L_CALC_AAD_done_6 +.L_AAD_blocks_9_6: + subq $1024,%r12 + kmovq (%r12),%k1 + vmovdqu8 0(%r10),%zmm11 + vmovdqu8 64(%r10),%zmm3 + vmovdqu8 128(%r10),%xmm4{%k1}{z} + vpshufb %zmm16,%zmm11,%zmm11 + vpshufb %zmm16,%zmm3,%zmm3 + vpshufb %xmm16,%xmm4,%xmm4 + vpxorq %zmm14,%zmm11,%zmm11 + vmovdqu64 208(%rdi),%zmm15 + vpclmulqdq $0x11,%zmm15,%zmm11,%zmm1 + vpclmulqdq $0x00,%zmm15,%zmm11,%zmm6 + vpclmulqdq $0x01,%zmm15,%zmm11,%zmm7 + vpclmulqdq $0x10,%zmm15,%zmm11,%zmm8 + vmovdqu64 272(%rdi),%zmm15 + vpclmulqdq $0x11,%zmm15,%zmm3,%zmm9 + vpclmulqdq $0x00,%zmm15,%zmm3,%zmm10 + vpclmulqdq $0x01,%zmm15,%zmm3,%zmm12 + vpclmulqdq $0x10,%zmm15,%zmm3,%zmm13 + vpxorq %zmm9,%zmm1,%zmm9 + vpxorq %zmm10,%zmm6,%zmm10 + vpxorq %zmm12,%zmm7,%zmm12 + vpxorq %zmm13,%zmm8,%zmm13 + vmovdqu64 336(%rdi),%xmm15 + vpclmulqdq $0x01,%xmm15,%xmm4,%xmm7 + vpclmulqdq $0x10,%xmm15,%xmm4,%xmm8 + vpclmulqdq $0x11,%xmm15,%xmm4,%xmm1 + vpclmulqdq $0x00,%xmm15,%xmm4,%xmm6 + + vpxorq %zmm12,%zmm7,%zmm7 + vpxorq %zmm13,%zmm8,%zmm8 + vpxorq %zmm9,%zmm1,%zmm1 + vpxorq %zmm10,%zmm6,%zmm6 + + vpxorq %zmm8,%zmm7,%zmm7 + vpsrldq $8,%zmm7,%zmm12 + vpslldq $8,%zmm7,%zmm13 + vpxorq %zmm12,%zmm1,%zmm1 + vpxorq %zmm13,%zmm6,%zmm6 + vextracti64x4 $1,%zmm1,%ymm12 + vpxorq %ymm12,%ymm1,%ymm1 + vextracti32x4 $1,%ymm1,%xmm12 + vpxorq %xmm12,%xmm1,%xmm1 + vextracti64x4 $1,%zmm6,%ymm13 + vpxorq %ymm13,%ymm6,%ymm6 + vextracti32x4 $1,%ymm6,%xmm13 + vpxorq %xmm13,%xmm6,%xmm6 + vmovdqa64 POLY2(%rip),%xmm15 + + + vpclmulqdq $0x01,%xmm6,%xmm15,%xmm7 + vpslldq $8,%xmm7,%xmm7 + vpxorq %xmm7,%xmm6,%xmm7 + + + vpclmulqdq $0x00,%xmm7,%xmm15,%xmm8 + vpsrldq $4,%xmm8,%xmm8 + vpclmulqdq $0x10,%xmm7,%xmm15,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm1,%xmm8,%xmm14 + + jmp .L_CALC_AAD_done_6 +.L_AAD_blocks_8_6: + subq $512,%r12 + kmovq (%r12),%k1 + vmovdqu8 0(%r10),%zmm11 + vmovdqu8 64(%r10),%zmm3{%k1}{z} + vpshufb %zmm16,%zmm11,%zmm11 + vpshufb %zmm16,%zmm3,%zmm3 + vpxorq %zmm14,%zmm11,%zmm11 + vmovdqu64 224(%rdi),%zmm15 + vpclmulqdq $0x11,%zmm15,%zmm11,%zmm1 + vpclmulqdq $0x00,%zmm15,%zmm11,%zmm6 + vpclmulqdq $0x01,%zmm15,%zmm11,%zmm7 + vpclmulqdq $0x10,%zmm15,%zmm11,%zmm8 + vmovdqu64 288(%rdi),%zmm15 + vpclmulqdq $0x11,%zmm15,%zmm3,%zmm9 + vpclmulqdq $0x00,%zmm15,%zmm3,%zmm10 + vpclmulqdq $0x01,%zmm15,%zmm3,%zmm12 + vpclmulqdq $0x10,%zmm15,%zmm3,%zmm13 + vpxorq %zmm9,%zmm1,%zmm9 + vpxorq %zmm10,%zmm6,%zmm10 + vpxorq %zmm12,%zmm7,%zmm12 + vpxorq %zmm13,%zmm8,%zmm13 + + vpxorq %zmm13,%zmm12,%zmm12 + vpsrldq 
$8,%zmm12,%zmm7 + vpslldq $8,%zmm12,%zmm8 + vpxorq %zmm7,%zmm9,%zmm1 + vpxorq %zmm8,%zmm10,%zmm6 + vextracti64x4 $1,%zmm1,%ymm12 + vpxorq %ymm12,%ymm1,%ymm1 + vextracti32x4 $1,%ymm1,%xmm12 + vpxorq %xmm12,%xmm1,%xmm1 + vextracti64x4 $1,%zmm6,%ymm13 + vpxorq %ymm13,%ymm6,%ymm6 + vextracti32x4 $1,%ymm6,%xmm13 + vpxorq %xmm13,%xmm6,%xmm6 + vmovdqa64 POLY2(%rip),%xmm15 + + + vpclmulqdq $0x01,%xmm6,%xmm15,%xmm7 + vpslldq $8,%xmm7,%xmm7 + vpxorq %xmm7,%xmm6,%xmm7 + + + vpclmulqdq $0x00,%xmm7,%xmm15,%xmm8 + vpsrldq $4,%xmm8,%xmm8 + vpclmulqdq $0x10,%xmm7,%xmm15,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm1,%xmm8,%xmm14 + + jmp .L_CALC_AAD_done_6 +.L_AAD_blocks_7_6: + subq $512,%r12 + kmovq (%r12),%k1 + vmovdqu8 0(%r10),%zmm11 + vmovdqu8 64(%r10),%zmm3{%k1}{z} + vpshufb %zmm16,%zmm11,%zmm11 + vpshufb %zmm16,%zmm3,%zmm3 + vpxorq %zmm14,%zmm11,%zmm11 + vmovdqu64 240(%rdi),%zmm15 + vpclmulqdq $0x11,%zmm15,%zmm11,%zmm9 + vpclmulqdq $0x00,%zmm15,%zmm11,%zmm10 + vpclmulqdq $0x01,%zmm15,%zmm11,%zmm12 + vpclmulqdq $0x10,%zmm15,%zmm11,%zmm13 + vmovdqu64 304(%rdi),%ymm15 + vinserti64x2 $2,336(%rdi),%zmm15,%zmm15 + vpclmulqdq $0x01,%zmm15,%zmm3,%zmm7 + vpclmulqdq $0x10,%zmm15,%zmm3,%zmm8 + vpclmulqdq $0x11,%zmm15,%zmm3,%zmm1 + vpclmulqdq $0x00,%zmm15,%zmm3,%zmm6 + + vpxorq %zmm12,%zmm7,%zmm7 + vpxorq %zmm13,%zmm8,%zmm8 + vpxorq %zmm9,%zmm1,%zmm1 + vpxorq %zmm10,%zmm6,%zmm6 + + vpxorq %zmm8,%zmm7,%zmm7 + vpsrldq $8,%zmm7,%zmm12 + vpslldq $8,%zmm7,%zmm13 + vpxorq %zmm12,%zmm1,%zmm1 + vpxorq %zmm13,%zmm6,%zmm6 + vextracti64x4 $1,%zmm1,%ymm12 + vpxorq %ymm12,%ymm1,%ymm1 + vextracti32x4 $1,%ymm1,%xmm12 + vpxorq %xmm12,%xmm1,%xmm1 + vextracti64x4 $1,%zmm6,%ymm13 + vpxorq %ymm13,%ymm6,%ymm6 + vextracti32x4 $1,%ymm6,%xmm13 + vpxorq %xmm13,%xmm6,%xmm6 + vmovdqa64 POLY2(%rip),%xmm15 + + + vpclmulqdq $0x01,%xmm6,%xmm15,%xmm7 + vpslldq $8,%xmm7,%xmm7 + vpxorq %xmm7,%xmm6,%xmm7 + + + vpclmulqdq $0x00,%xmm7,%xmm15,%xmm8 + vpsrldq $4,%xmm8,%xmm8 + vpclmulqdq $0x10,%xmm7,%xmm15,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm1,%xmm8,%xmm14 + + jmp .L_CALC_AAD_done_6 +.L_AAD_blocks_6_6: + subq $512,%r12 + kmovq (%r12),%k1 + vmovdqu8 0(%r10),%zmm11 + vmovdqu8 64(%r10),%ymm3{%k1}{z} + vpshufb %zmm16,%zmm11,%zmm11 + vpshufb %ymm16,%ymm3,%ymm3 + vpxorq %zmm14,%zmm11,%zmm11 + vmovdqu64 256(%rdi),%zmm15 + vpclmulqdq $0x11,%zmm15,%zmm11,%zmm9 + vpclmulqdq $0x00,%zmm15,%zmm11,%zmm10 + vpclmulqdq $0x01,%zmm15,%zmm11,%zmm12 + vpclmulqdq $0x10,%zmm15,%zmm11,%zmm13 + vmovdqu64 320(%rdi),%ymm15 + vpclmulqdq $0x01,%ymm15,%ymm3,%ymm7 + vpclmulqdq $0x10,%ymm15,%ymm3,%ymm8 + vpclmulqdq $0x11,%ymm15,%ymm3,%ymm1 + vpclmulqdq $0x00,%ymm15,%ymm3,%ymm6 + + vpxorq %zmm12,%zmm7,%zmm7 + vpxorq %zmm13,%zmm8,%zmm8 + vpxorq %zmm9,%zmm1,%zmm1 + vpxorq %zmm10,%zmm6,%zmm6 + + vpxorq %zmm8,%zmm7,%zmm7 + vpsrldq $8,%zmm7,%zmm12 + vpslldq $8,%zmm7,%zmm13 + vpxorq %zmm12,%zmm1,%zmm1 + vpxorq %zmm13,%zmm6,%zmm6 + vextracti64x4 $1,%zmm1,%ymm12 + vpxorq %ymm12,%ymm1,%ymm1 + vextracti32x4 $1,%ymm1,%xmm12 + vpxorq %xmm12,%xmm1,%xmm1 + vextracti64x4 $1,%zmm6,%ymm13 + vpxorq %ymm13,%ymm6,%ymm6 + vextracti32x4 $1,%ymm6,%xmm13 + vpxorq %xmm13,%xmm6,%xmm6 + vmovdqa64 POLY2(%rip),%xmm15 + + + vpclmulqdq $0x01,%xmm6,%xmm15,%xmm7 + vpslldq $8,%xmm7,%xmm7 + vpxorq %xmm7,%xmm6,%xmm7 + + + vpclmulqdq $0x00,%xmm7,%xmm15,%xmm8 + vpsrldq $4,%xmm8,%xmm8 + vpclmulqdq $0x10,%xmm7,%xmm15,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm1,%xmm8,%xmm14 + + jmp .L_CALC_AAD_done_6 +.L_AAD_blocks_5_6: + subq $512,%r12 + kmovq (%r12),%k1 + vmovdqu8 
0(%r10),%zmm11 + vmovdqu8 64(%r10),%xmm3{%k1}{z} + vpshufb %zmm16,%zmm11,%zmm11 + vpshufb %xmm16,%xmm3,%xmm3 + vpxorq %zmm14,%zmm11,%zmm11 + vmovdqu64 272(%rdi),%zmm15 + vpclmulqdq $0x11,%zmm15,%zmm11,%zmm9 + vpclmulqdq $0x00,%zmm15,%zmm11,%zmm10 + vpclmulqdq $0x01,%zmm15,%zmm11,%zmm12 + vpclmulqdq $0x10,%zmm15,%zmm11,%zmm13 + vmovdqu64 336(%rdi),%xmm15 + vpclmulqdq $0x01,%xmm15,%xmm3,%xmm7 + vpclmulqdq $0x10,%xmm15,%xmm3,%xmm8 + vpclmulqdq $0x11,%xmm15,%xmm3,%xmm1 + vpclmulqdq $0x00,%xmm15,%xmm3,%xmm6 + + vpxorq %zmm12,%zmm7,%zmm7 + vpxorq %zmm13,%zmm8,%zmm8 + vpxorq %zmm9,%zmm1,%zmm1 + vpxorq %zmm10,%zmm6,%zmm6 + + vpxorq %zmm8,%zmm7,%zmm7 + vpsrldq $8,%zmm7,%zmm12 + vpslldq $8,%zmm7,%zmm13 + vpxorq %zmm12,%zmm1,%zmm1 + vpxorq %zmm13,%zmm6,%zmm6 + vextracti64x4 $1,%zmm1,%ymm12 + vpxorq %ymm12,%ymm1,%ymm1 + vextracti32x4 $1,%ymm1,%xmm12 + vpxorq %xmm12,%xmm1,%xmm1 + vextracti64x4 $1,%zmm6,%ymm13 + vpxorq %ymm13,%ymm6,%ymm6 + vextracti32x4 $1,%ymm6,%xmm13 + vpxorq %xmm13,%xmm6,%xmm6 + vmovdqa64 POLY2(%rip),%xmm15 + + + vpclmulqdq $0x01,%xmm6,%xmm15,%xmm7 + vpslldq $8,%xmm7,%xmm7 + vpxorq %xmm7,%xmm6,%xmm7 + + + vpclmulqdq $0x00,%xmm7,%xmm15,%xmm8 + vpsrldq $4,%xmm8,%xmm8 + vpclmulqdq $0x10,%xmm7,%xmm15,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm1,%xmm8,%xmm14 + + jmp .L_CALC_AAD_done_6 +.L_AAD_blocks_4_6: + kmovq (%r12),%k1 + vmovdqu8 0(%r10),%zmm11{%k1}{z} + vpshufb %zmm16,%zmm11,%zmm11 + vpxorq %zmm14,%zmm11,%zmm11 + vmovdqu64 288(%rdi),%zmm15 + vpclmulqdq $0x11,%zmm15,%zmm11,%zmm9 + vpclmulqdq $0x00,%zmm15,%zmm11,%zmm10 + vpclmulqdq $0x01,%zmm15,%zmm11,%zmm12 + vpclmulqdq $0x10,%zmm15,%zmm11,%zmm13 + + vpxorq %zmm13,%zmm12,%zmm12 + vpsrldq $8,%zmm12,%zmm7 + vpslldq $8,%zmm12,%zmm8 + vpxorq %zmm7,%zmm9,%zmm1 + vpxorq %zmm8,%zmm10,%zmm6 + vextracti64x4 $1,%zmm1,%ymm12 + vpxorq %ymm12,%ymm1,%ymm1 + vextracti32x4 $1,%ymm1,%xmm12 + vpxorq %xmm12,%xmm1,%xmm1 + vextracti64x4 $1,%zmm6,%ymm13 + vpxorq %ymm13,%ymm6,%ymm6 + vextracti32x4 $1,%ymm6,%xmm13 + vpxorq %xmm13,%xmm6,%xmm6 + vmovdqa64 POLY2(%rip),%xmm15 + + + vpclmulqdq $0x01,%xmm6,%xmm15,%xmm7 + vpslldq $8,%xmm7,%xmm7 + vpxorq %xmm7,%xmm6,%xmm7 + + + vpclmulqdq $0x00,%xmm7,%xmm15,%xmm8 + vpsrldq $4,%xmm8,%xmm8 + vpclmulqdq $0x10,%xmm7,%xmm15,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm1,%xmm8,%xmm14 + + jmp .L_CALC_AAD_done_6 +.L_AAD_blocks_3_6: + kmovq (%r12),%k1 + vmovdqu8 0(%r10),%zmm11{%k1}{z} + vpshufb %zmm16,%zmm11,%zmm11 + vpxorq %zmm14,%zmm11,%zmm11 + vmovdqu64 304(%rdi),%ymm15 + vinserti64x2 $2,336(%rdi),%zmm15,%zmm15 + vpclmulqdq $0x01,%zmm15,%zmm11,%zmm7 + vpclmulqdq $0x10,%zmm15,%zmm11,%zmm8 + vpclmulqdq $0x11,%zmm15,%zmm11,%zmm1 + vpclmulqdq $0x00,%zmm15,%zmm11,%zmm6 + + vpxorq %zmm8,%zmm7,%zmm7 + vpsrldq $8,%zmm7,%zmm12 + vpslldq $8,%zmm7,%zmm13 + vpxorq %zmm12,%zmm1,%zmm1 + vpxorq %zmm13,%zmm6,%zmm6 + vextracti64x4 $1,%zmm1,%ymm12 + vpxorq %ymm12,%ymm1,%ymm1 + vextracti32x4 $1,%ymm1,%xmm12 + vpxorq %xmm12,%xmm1,%xmm1 + vextracti64x4 $1,%zmm6,%ymm13 + vpxorq %ymm13,%ymm6,%ymm6 + vextracti32x4 $1,%ymm6,%xmm13 + vpxorq %xmm13,%xmm6,%xmm6 + vmovdqa64 POLY2(%rip),%xmm15 + + + vpclmulqdq $0x01,%xmm6,%xmm15,%xmm7 + vpslldq $8,%xmm7,%xmm7 + vpxorq %xmm7,%xmm6,%xmm7 + + + vpclmulqdq $0x00,%xmm7,%xmm15,%xmm8 + vpsrldq $4,%xmm8,%xmm8 + vpclmulqdq $0x10,%xmm7,%xmm15,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm1,%xmm8,%xmm14 + + jmp .L_CALC_AAD_done_6 +.L_AAD_blocks_2_6: + kmovq (%r12),%k1 + vmovdqu8 0(%r10),%ymm11{%k1}{z} + vpshufb %ymm16,%ymm11,%ymm11 + vpxorq %zmm14,%zmm11,%zmm11 + vmovdqu64 
320(%rdi),%ymm15 + vpclmulqdq $0x01,%ymm15,%ymm11,%ymm7 + vpclmulqdq $0x10,%ymm15,%ymm11,%ymm8 + vpclmulqdq $0x11,%ymm15,%ymm11,%ymm1 + vpclmulqdq $0x00,%ymm15,%ymm11,%ymm6 + + vpxorq %zmm8,%zmm7,%zmm7 + vpsrldq $8,%zmm7,%zmm12 + vpslldq $8,%zmm7,%zmm13 + vpxorq %zmm12,%zmm1,%zmm1 + vpxorq %zmm13,%zmm6,%zmm6 + vextracti64x4 $1,%zmm1,%ymm12 + vpxorq %ymm12,%ymm1,%ymm1 + vextracti32x4 $1,%ymm1,%xmm12 + vpxorq %xmm12,%xmm1,%xmm1 + vextracti64x4 $1,%zmm6,%ymm13 + vpxorq %ymm13,%ymm6,%ymm6 + vextracti32x4 $1,%ymm6,%xmm13 + vpxorq %xmm13,%xmm6,%xmm6 + vmovdqa64 POLY2(%rip),%xmm15 + + + vpclmulqdq $0x01,%xmm6,%xmm15,%xmm7 + vpslldq $8,%xmm7,%xmm7 + vpxorq %xmm7,%xmm6,%xmm7 + + + vpclmulqdq $0x00,%xmm7,%xmm15,%xmm8 + vpsrldq $4,%xmm8,%xmm8 + vpclmulqdq $0x10,%xmm7,%xmm15,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm1,%xmm8,%xmm14 + + jmp .L_CALC_AAD_done_6 +.L_AAD_blocks_1_6: + kmovq (%r12),%k1 + vmovdqu8 0(%r10),%xmm11{%k1}{z} + vpshufb %xmm16,%xmm11,%xmm11 + vpxorq %zmm14,%zmm11,%zmm11 + vmovdqu64 336(%rdi),%xmm15 + vpclmulqdq $0x01,%xmm15,%xmm11,%xmm7 + vpclmulqdq $0x10,%xmm15,%xmm11,%xmm8 + vpclmulqdq $0x11,%xmm15,%xmm11,%xmm1 + vpclmulqdq $0x00,%xmm15,%xmm11,%xmm6 + + vpxorq %zmm8,%zmm7,%zmm7 + vpsrldq $8,%zmm7,%zmm12 + vpslldq $8,%zmm7,%zmm13 + vpxorq %zmm12,%zmm1,%zmm1 + vpxorq %zmm13,%zmm6,%zmm6 + vextracti64x4 $1,%zmm1,%ymm12 + vpxorq %ymm12,%ymm1,%ymm1 + vextracti32x4 $1,%ymm1,%xmm12 + vpxorq %xmm12,%xmm1,%xmm1 + vextracti64x4 $1,%zmm6,%ymm13 + vpxorq %ymm13,%ymm6,%ymm6 + vextracti32x4 $1,%ymm6,%xmm13 + vpxorq %xmm13,%xmm6,%xmm6 + vmovdqa64 POLY2(%rip),%xmm15 + + + vpclmulqdq $0x01,%xmm6,%xmm15,%xmm7 + vpslldq $8,%xmm7,%xmm7 + vpxorq %xmm7,%xmm6,%xmm7 + + + vpclmulqdq $0x00,%xmm7,%xmm15,%xmm8 + vpsrldq $4,%xmm8,%xmm8 + vpclmulqdq $0x10,%xmm7,%xmm15,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm1,%xmm8,%xmm14 + +.L_CALC_AAD_done_6: + vmovdqu64 %xmm14,64(%rdi) + cmpq $256,%rdx + jbe .Lskip_hkeys_cleanup_9 + vpxor %xmm0,%xmm0,%xmm0 + vmovdqa64 %zmm0,0(%rsp) + vmovdqa64 %zmm0,64(%rsp) + vmovdqa64 %zmm0,128(%rsp) + vmovdqa64 %zmm0,192(%rsp) + vmovdqa64 %zmm0,256(%rsp) + vmovdqa64 %zmm0,320(%rsp) + vmovdqa64 %zmm0,384(%rsp) + vmovdqa64 %zmm0,448(%rsp) + vmovdqa64 %zmm0,512(%rsp) + vmovdqa64 %zmm0,576(%rsp) + vmovdqa64 %zmm0,640(%rsp) + vmovdqa64 %zmm0,704(%rsp) +.Lskip_hkeys_cleanup_9: + vzeroupper + leaq (%rbp),%rsp +.cfi_def_cfa_register %rsp + popq %r15 +.cfi_adjust_cfa_offset -8 +.cfi_restore %r15 + popq %r14 +.cfi_adjust_cfa_offset -8 +.cfi_restore %r14 + popq %r13 +.cfi_adjust_cfa_offset -8 +.cfi_restore %r13 + popq %r12 +.cfi_adjust_cfa_offset -8 +.cfi_restore %r12 + popq %rbp +.cfi_adjust_cfa_offset -8 +.cfi_restore %rbp + popq %rbx +.cfi_adjust_cfa_offset -8 +.cfi_restore %rbx +.Lexit_update_aad: + .byte 0xf3,0xc3 +.Lghash_seh_end: +.cfi_endproc +.size ossl_aes_gcm_update_aad_avx512, .-ossl_aes_gcm_update_aad_avx512 +.globl ossl_aes_gcm_encrypt_avx512 +.type ossl_aes_gcm_encrypt_avx512,@function +.align 32 +ossl_aes_gcm_encrypt_avx512: +.cfi_startproc +.Lencrypt_seh_begin: +.byte 243,15,30,250 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-16 +.Lencrypt_seh_push_rbx: + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-24 +.Lencrypt_seh_push_rbp: + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 +.Lencrypt_seh_push_r12: + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 +.Lencrypt_seh_push_r13: + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 +.Lencrypt_seh_push_r14: + pushq %r15 +.cfi_adjust_cfa_offset 8 
+.cfi_offset %r15,-56 +.Lencrypt_seh_push_r15: + + + + + + + + + + + leaq 0(%rsp),%rbp +.cfi_def_cfa_register %rbp +.Lencrypt_seh_setfp: + +.Lencrypt_seh_prolog_end: + subq $1588,%rsp + andq $(-64),%rsp + + + movl 240(%rdi),%eax + cmpl $9,%eax + je .Laes_gcm_encrypt_128_avx512 + cmpl $11,%eax + je .Laes_gcm_encrypt_192_avx512 + cmpl $13,%eax + je .Laes_gcm_encrypt_256_avx512 + xorl %eax,%eax + jmp .Lexit_gcm_encrypt +.align 32 +.Laes_gcm_encrypt_128_avx512: + orq %r8,%r8 + je .L_enc_dec_done_10 + xorq %r14,%r14 + vmovdqu64 64(%rsi),%xmm14 + + movq (%rdx),%r11 + orq %r11,%r11 + je .L_partial_block_done_11 + movl $16,%r10d + leaq byte_len_to_mask_table(%rip),%r12 + cmpq %r10,%r8 + cmovcq %r8,%r10 + kmovw (%r12,%r10,2),%k1 + vmovdqu8 (%rcx),%xmm0{%k1}{z} + + vmovdqu64 16(%rsi),%xmm3 + vmovdqu64 336(%rsi),%xmm4 + + + + leaq SHIFT_MASK(%rip),%r12 + addq %r11,%r12 + vmovdqu64 (%r12),%xmm5 + vpshufb %xmm5,%xmm3,%xmm3 + vpxorq %xmm0,%xmm3,%xmm3 + + + leaq (%r8,%r11,1),%r13 + subq $16,%r13 + jge .L_no_extra_mask_11 + subq %r13,%r12 +.L_no_extra_mask_11: + + + + vmovdqu64 16(%r12),%xmm0 + vpand %xmm0,%xmm3,%xmm3 + vpshufb SHUF_MASK(%rip),%xmm3,%xmm3 + vpshufb %xmm5,%xmm3,%xmm3 + vpxorq %xmm3,%xmm14,%xmm14 + cmpq $0,%r13 + jl .L_partial_incomplete_11 + + vpclmulqdq $0x11,%xmm4,%xmm14,%xmm7 + vpclmulqdq $0x00,%xmm4,%xmm14,%xmm10 + vpclmulqdq $0x01,%xmm4,%xmm14,%xmm11 + vpclmulqdq $0x10,%xmm4,%xmm14,%xmm14 + vpxorq %xmm11,%xmm14,%xmm14 + + vpsrldq $8,%xmm14,%xmm11 + vpslldq $8,%xmm14,%xmm14 + vpxorq %xmm11,%xmm7,%xmm7 + vpxorq %xmm10,%xmm14,%xmm14 + + + + vmovdqu64 POLY2(%rip),%xmm11 + + vpclmulqdq $0x01,%xmm14,%xmm11,%xmm10 + vpslldq $8,%xmm10,%xmm10 + vpxorq %xmm10,%xmm14,%xmm14 + + + + vpclmulqdq $0x00,%xmm14,%xmm11,%xmm10 + vpsrldq $4,%xmm10,%xmm10 + vpclmulqdq $0x10,%xmm14,%xmm11,%xmm14 + vpslldq $4,%xmm14,%xmm14 + + vpternlogq $0x96,%xmm10,%xmm7,%xmm14 + + movq $0,(%rdx) + + movq %r11,%r12 + movq $16,%r11 + subq %r12,%r11 + jmp .L_enc_dec_done_11 + +.L_partial_incomplete_11: + addq %r8,(%rdx) + movq %r8,%r11 + +.L_enc_dec_done_11: + + + leaq byte_len_to_mask_table(%rip),%r12 + kmovw (%r12,%r11,2),%k1 + vmovdqu64 %xmm14,64(%rsi) + + vpshufb SHUF_MASK(%rip),%xmm3,%xmm3 + vpshufb %xmm5,%xmm3,%xmm3 + movq %r9,%r12 + vmovdqu8 %xmm3,(%r12){%k1} +.L_partial_block_done_11: + vmovdqu64 0(%rsi),%xmm2 + subq %r11,%r8 + je .L_enc_dec_done_10 + cmpq $256,%r8 + jbe .L_message_below_equal_16_blocks_10 + + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vmovdqa64 ddq_addbe_4444(%rip),%zmm27 + vmovdqa64 ddq_addbe_1234(%rip),%zmm28 + + + + + + + vmovd %xmm2,%r15d + andl $255,%r15d + + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpshufb %zmm29,%zmm2,%zmm2 + + + + cmpb $240,%r15b + jae .L_next_16_overflow_12 + vpaddd %zmm28,%zmm2,%zmm7 + vpaddd %zmm27,%zmm7,%zmm10 + vpaddd %zmm27,%zmm10,%zmm11 + vpaddd %zmm27,%zmm11,%zmm12 + jmp .L_next_16_ok_12 +.L_next_16_overflow_12: + vpshufb %zmm29,%zmm2,%zmm2 + vmovdqa64 ddq_add_4444(%rip),%zmm12 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm7 + vpaddd %zmm12,%zmm7,%zmm10 + vpaddd %zmm12,%zmm10,%zmm11 + vpaddd %zmm12,%zmm11,%zmm12 + vpshufb %zmm29,%zmm7,%zmm7 + vpshufb %zmm29,%zmm10,%zmm10 + vpshufb %zmm29,%zmm11,%zmm11 + vpshufb %zmm29,%zmm12,%zmm12 +.L_next_16_ok_12: + vshufi64x2 $255,%zmm12,%zmm12,%zmm2 + addb $16,%r15b + + vmovdqu8 0(%rcx,%r11,1),%zmm0 + vmovdqu8 64(%rcx,%r11,1),%zmm3 + vmovdqu8 128(%rcx,%r11,1),%zmm4 + vmovdqu8 192(%rcx,%r11,1),%zmm5 + + + vbroadcastf64x2 0(%rdi),%zmm6 + vpxorq %zmm6,%zmm7,%zmm7 + vpxorq %zmm6,%zmm10,%zmm10 + vpxorq %zmm6,%zmm11,%zmm11 + vpxorq %zmm6,%zmm12,%zmm12 + 
vbroadcastf64x2 16(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 32(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 48(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 64(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 80(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 96(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 112(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 128(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 144(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 160(%rdi),%zmm6 + vaesenclast %zmm6,%zmm7,%zmm7 + vaesenclast %zmm6,%zmm10,%zmm10 + vaesenclast %zmm6,%zmm11,%zmm11 + vaesenclast %zmm6,%zmm12,%zmm12 + + + vpxorq %zmm0,%zmm7,%zmm7 + vpxorq %zmm3,%zmm10,%zmm10 + vpxorq %zmm4,%zmm11,%zmm11 + vpxorq %zmm5,%zmm12,%zmm12 + + + movq %r9,%r10 + vmovdqu8 %zmm7,0(%r10,%r11,1) + vmovdqu8 %zmm10,64(%r10,%r11,1) + vmovdqu8 %zmm11,128(%r10,%r11,1) + vmovdqu8 %zmm12,192(%r10,%r11,1) + + vpshufb %zmm29,%zmm7,%zmm7 + vpshufb %zmm29,%zmm10,%zmm10 + vpshufb %zmm29,%zmm11,%zmm11 + vpshufb %zmm29,%zmm12,%zmm12 + vmovdqa64 %zmm7,768(%rsp) + vmovdqa64 %zmm10,832(%rsp) + vmovdqa64 %zmm11,896(%rsp) + vmovdqa64 %zmm12,960(%rsp) + testq %r14,%r14 + jnz .L_skip_hkeys_precomputation_13 + + vmovdqu64 288(%rsi),%zmm0 + vmovdqu64 %zmm0,704(%rsp) + + vmovdqu64 224(%rsi),%zmm3 + vmovdqu64 %zmm3,640(%rsp) + + + vshufi64x2 $0x00,%zmm3,%zmm3,%zmm3 + + vmovdqu64 160(%rsi),%zmm4 + vmovdqu64 %zmm4,576(%rsp) + + vmovdqu64 96(%rsi),%zmm5 + vmovdqu64 %zmm5,512(%rsp) +.L_skip_hkeys_precomputation_13: + cmpq $512,%r8 + jb .L_message_below_32_blocks_10 + + + + cmpb $240,%r15b + jae .L_next_16_overflow_14 + vpaddd %zmm28,%zmm2,%zmm7 + vpaddd %zmm27,%zmm7,%zmm10 + vpaddd %zmm27,%zmm10,%zmm11 + vpaddd %zmm27,%zmm11,%zmm12 + jmp .L_next_16_ok_14 +.L_next_16_overflow_14: + vpshufb %zmm29,%zmm2,%zmm2 + vmovdqa64 ddq_add_4444(%rip),%zmm12 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm7 + vpaddd %zmm12,%zmm7,%zmm10 + vpaddd %zmm12,%zmm10,%zmm11 + vpaddd %zmm12,%zmm11,%zmm12 + vpshufb %zmm29,%zmm7,%zmm7 + vpshufb %zmm29,%zmm10,%zmm10 + vpshufb %zmm29,%zmm11,%zmm11 + vpshufb %zmm29,%zmm12,%zmm12 +.L_next_16_ok_14: + vshufi64x2 $255,%zmm12,%zmm12,%zmm2 + addb $16,%r15b + + vmovdqu8 256(%rcx,%r11,1),%zmm0 + vmovdqu8 320(%rcx,%r11,1),%zmm3 + vmovdqu8 384(%rcx,%r11,1),%zmm4 + vmovdqu8 448(%rcx,%r11,1),%zmm5 + + + vbroadcastf64x2 0(%rdi),%zmm6 + vpxorq %zmm6,%zmm7,%zmm7 + vpxorq %zmm6,%zmm10,%zmm10 + vpxorq %zmm6,%zmm11,%zmm11 + vpxorq %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 16(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 32(%rdi),%zmm6 + vaesenc 
%zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 48(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 64(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 80(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 96(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 112(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 128(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 144(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 160(%rdi),%zmm6 + vaesenclast %zmm6,%zmm7,%zmm7 + vaesenclast %zmm6,%zmm10,%zmm10 + vaesenclast %zmm6,%zmm11,%zmm11 + vaesenclast %zmm6,%zmm12,%zmm12 + + + vpxorq %zmm0,%zmm7,%zmm7 + vpxorq %zmm3,%zmm10,%zmm10 + vpxorq %zmm4,%zmm11,%zmm11 + vpxorq %zmm5,%zmm12,%zmm12 + + + movq %r9,%r10 + vmovdqu8 %zmm7,256(%r10,%r11,1) + vmovdqu8 %zmm10,320(%r10,%r11,1) + vmovdqu8 %zmm11,384(%r10,%r11,1) + vmovdqu8 %zmm12,448(%r10,%r11,1) + + vpshufb %zmm29,%zmm7,%zmm7 + vpshufb %zmm29,%zmm10,%zmm10 + vpshufb %zmm29,%zmm11,%zmm11 + vpshufb %zmm29,%zmm12,%zmm12 + vmovdqa64 %zmm7,1024(%rsp) + vmovdqa64 %zmm10,1088(%rsp) + vmovdqa64 %zmm11,1152(%rsp) + vmovdqa64 %zmm12,1216(%rsp) + testq %r14,%r14 + jnz .L_skip_hkeys_precomputation_15 + vmovdqu64 640(%rsp),%zmm3 + + + vshufi64x2 $0x00,%zmm3,%zmm3,%zmm3 + + vmovdqu64 576(%rsp),%zmm4 + vmovdqu64 512(%rsp),%zmm5 + + vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4 + vpxorq %zmm10,%zmm4,%zmm4 + + vpsrldq $8,%zmm4,%zmm10 + vpslldq $8,%zmm4,%zmm4 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4 + vpslldq $4,%zmm4,%zmm4 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm4 + + vmovdqu64 %zmm4,448(%rsp) + + vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5 + vpxorq %zmm10,%zmm5,%zmm5 + + vpsrldq $8,%zmm5,%zmm10 + vpslldq $8,%zmm5,%zmm5 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5 + vpslldq $4,%zmm5,%zmm5 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm5 + + vmovdqu64 %zmm5,384(%rsp) + + vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4 + vpxorq %zmm10,%zmm4,%zmm4 + + vpsrldq $8,%zmm4,%zmm10 + vpslldq $8,%zmm4,%zmm4 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq 
%zmm7,%zmm4,%zmm4 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4 + vpslldq $4,%zmm4,%zmm4 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm4 + + vmovdqu64 %zmm4,320(%rsp) + + vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5 + vpxorq %zmm10,%zmm5,%zmm5 + + vpsrldq $8,%zmm5,%zmm10 + vpslldq $8,%zmm5,%zmm5 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5 + vpslldq $4,%zmm5,%zmm5 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm5 + + vmovdqu64 %zmm5,256(%rsp) + + vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4 + vpxorq %zmm10,%zmm4,%zmm4 + + vpsrldq $8,%zmm4,%zmm10 + vpslldq $8,%zmm4,%zmm4 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4 + vpslldq $4,%zmm4,%zmm4 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm4 + + vmovdqu64 %zmm4,192(%rsp) + + vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5 + vpxorq %zmm10,%zmm5,%zmm5 + + vpsrldq $8,%zmm5,%zmm10 + vpslldq $8,%zmm5,%zmm5 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5 + vpslldq $4,%zmm5,%zmm5 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm5 + + vmovdqu64 %zmm5,128(%rsp) + + vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4 + vpxorq %zmm10,%zmm4,%zmm4 + + vpsrldq $8,%zmm4,%zmm10 + vpslldq $8,%zmm4,%zmm4 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4 + vpslldq $4,%zmm4,%zmm4 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm4 + + vmovdqu64 %zmm4,64(%rsp) + + vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5 + vpxorq %zmm10,%zmm5,%zmm5 + + vpsrldq $8,%zmm5,%zmm10 + vpslldq $8,%zmm5,%zmm5 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5 + vpslldq $4,%zmm5,%zmm5 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm5 + + vmovdqu64 %zmm5,0(%rsp) +.L_skip_hkeys_precomputation_15: + movq $1,%r14 + addq $512,%r11 + subq $512,%r8 + + cmpq $768,%r8 + jb 
.L_no_more_big_nblocks_10 +.L_encrypt_big_nblocks_10: + cmpb $240,%r15b + jae .L_16_blocks_overflow_16 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_16 +.L_16_blocks_overflow_16: + vpshufb %zmm29,%zmm2,%zmm2 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_16: + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp),%zmm1 + + + + + vshufi64x2 $255,%zmm5,%zmm5,%zmm2 + addb $16,%r15b + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + + + + + + + + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm6 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + + + + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%zmm21 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm15,%zmm10,%zmm26 + vpxorq %zmm12,%zmm6,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + 
vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + + + + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + + + + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1) + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 + vmovdqa64 %zmm0,1280(%rsp) + vmovdqa64 %zmm3,1344(%rsp) + vmovdqa64 %zmm4,1408(%rsp) + vmovdqa64 %zmm5,1472(%rsp) + cmpb $240,%r15b + jae .L_16_blocks_overflow_17 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_17 +.L_16_blocks_overflow_17: + vpshufb %zmm29,%zmm2,%zmm2 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_17: + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 256(%rsp),%zmm1 + + + + + vshufi64x2 $255,%zmm5,%zmm5,%zmm2 + addb $16,%r15b + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 320(%rsp),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + + + + + + + + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 384(%rsp),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 448(%rsp),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm6 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + + + + vmovdqu8 256(%rcx,%r11,1),%zmm17 + vmovdqu8 320(%rcx,%r11,1),%zmm19 + vmovdqu8 384(%rcx,%r11,1),%zmm20 + vmovdqu8 448(%rcx,%r11,1),%zmm21 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + 
vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vpternlogq $0x96,%zmm12,%zmm6,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + + + + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + + + + movq %r9,%r10 + vmovdqu8 %zmm0,256(%r10,%r11,1) + vmovdqu8 %zmm3,320(%r10,%r11,1) + vmovdqu8 %zmm4,384(%r10,%r11,1) + vmovdqu8 %zmm5,448(%r10,%r11,1) + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 + vmovdqa64 %zmm0,768(%rsp) + vmovdqa64 %zmm3,832(%rsp) + vmovdqa64 %zmm4,896(%rsp) + vmovdqa64 %zmm5,960(%rsp) + cmpb $240,%r15b + jae .L_16_blocks_overflow_18 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_18 +.L_16_blocks_overflow_18: + vpshufb %zmm29,%zmm2,%zmm2 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_18: + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + + + + + vshufi64x2 $255,%zmm5,%zmm5,%zmm2 + addb $16,%r15b + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + + + + + + + + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm6 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + 
vbroadcastf64x2 96(%rdi),%zmm30 + + + + vmovdqu8 512(%rcx,%r11,1),%zmm17 + vmovdqu8 576(%rcx,%r11,1),%zmm19 + vmovdqu8 640(%rcx,%r11,1),%zmm20 + vmovdqu8 704(%rcx,%r11,1),%zmm21 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + + + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vpternlogq $0x96,%zmm15,%zmm12,%zmm6 + vpxorq %zmm24,%zmm6,%zmm6 + vpternlogq $0x96,%zmm10,%zmm13,%zmm7 + vpxorq %zmm25,%zmm7,%zmm7 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vextracti64x4 $1,%zmm6,%ymm12 + vpxorq %ymm12,%ymm6,%ymm6 + vextracti32x4 $1,%ymm6,%xmm12 + vpxorq %xmm12,%xmm6,%xmm6 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm6 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + + + + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + + + + movq %r9,%r10 + vmovdqu8 %zmm0,512(%r10,%r11,1) + vmovdqu8 %zmm3,576(%r10,%r11,1) + vmovdqu8 %zmm4,640(%r10,%r11,1) + vmovdqu8 %zmm5,704(%r10,%r11,1) + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 + vmovdqa64 %zmm0,1024(%rsp) + vmovdqa64 %zmm3,1088(%rsp) + vmovdqa64 %zmm4,1152(%rsp) + vmovdqa64 %zmm5,1216(%rsp) + vmovdqa64 %zmm6,%zmm14 + + addq $768,%r11 + subq $768,%r8 + cmpq $768,%r8 + jae .L_encrypt_big_nblocks_10 + +.L_no_more_big_nblocks_10: + + cmpq $512,%r8 + jae .L_encrypt_32_blocks_10 + + cmpq $256,%r8 + jae .L_encrypt_16_blocks_10 +.L_encrypt_0_blocks_ghash_32_10: + movl %r8d,%r10d + andl $~15,%r10d + movl $256,%ebx + subl %r10d,%ebx + vmovdqa64 768(%rsp),%zmm13 + vpxorq %zmm14,%zmm13,%zmm13 + vmovdqu64 0(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 832(%rsp),%zmm13 + vmovdqu64 64(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + vpxorq %zmm10,%zmm4,%zmm26 + vpxorq %zmm6,%zmm0,%zmm24 + vpxorq %zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + vmovdqa64 896(%rsp),%zmm13 + vmovdqu64 128(%rsp,%rbx,1),%zmm12 + vpclmulqdq 
$0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 960(%rsp),%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + + vpternlogq $0x96,%zmm10,%zmm4,%zmm26 + vpternlogq $0x96,%zmm6,%zmm0,%zmm24 + vpternlogq $0x96,%zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + addl $256,%ebx + movl %r8d,%r10d + addl $15,%r10d + shrl $4,%r10d + je .L_last_num_blocks_is_0_19 + + cmpl $8,%r10d + je .L_last_num_blocks_is_8_19 + jb .L_last_num_blocks_is_7_1_19 + + + cmpl $12,%r10d + je .L_last_num_blocks_is_12_19 + jb .L_last_num_blocks_is_11_9_19 + + + cmpl $15,%r10d + je .L_last_num_blocks_is_15_19 + ja .L_last_num_blocks_is_16_19 + cmpl $14,%r10d + je .L_last_num_blocks_is_14_19 + jmp .L_last_num_blocks_is_13_19 + +.L_last_num_blocks_is_11_9_19: + + cmpl $10,%r10d + je .L_last_num_blocks_is_10_19 + ja .L_last_num_blocks_is_11_19 + jmp .L_last_num_blocks_is_9_19 + +.L_last_num_blocks_is_7_1_19: + cmpl $4,%r10d + je .L_last_num_blocks_is_4_19 + jb .L_last_num_blocks_is_3_1_19 + + cmpl $6,%r10d + ja .L_last_num_blocks_is_7_19 + je .L_last_num_blocks_is_6_19 + jmp .L_last_num_blocks_is_5_19 + +.L_last_num_blocks_is_3_1_19: + + cmpl $2,%r10d + ja .L_last_num_blocks_is_3_19 + je .L_last_num_blocks_is_2_19 +.L_last_num_blocks_is_1_19: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $255,%r15d + jae .L_16_blocks_overflow_20 + vpaddd %xmm28,%xmm2,%xmm0 + jmp .L_16_blocks_ok_20 + +.L_16_blocks_overflow_20: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %xmm29,%xmm0,%xmm0 +.L_16_blocks_ok_20: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%xmm17{%k1}{z} + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + 
vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %xmm31,%xmm0,%xmm0 + vaesenclast %xmm30,%xmm0,%xmm0 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti32x4 $0,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %xmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %xmm29,%xmm0,%xmm17 + vextracti32x4 $0,%zmm17,%xmm7 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_21 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_21 +.L_small_initial_partial_block_21: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + + + vpsrldq $8,%zmm26,%zmm0 + vpslldq $8,%zmm26,%zmm3 + vpxorq %zmm0,%zmm24,%zmm24 + vpxorq %zmm3,%zmm25,%zmm25 + vextracti64x4 $1,%zmm24,%ymm0 + vpxorq %ymm0,%ymm24,%ymm24 + vextracti32x4 $1,%ymm24,%xmm0 + vpxorq %xmm0,%xmm24,%xmm24 + vextracti64x4 $1,%zmm25,%ymm3 + vpxorq %ymm3,%ymm25,%ymm25 + vextracti32x4 $1,%ymm25,%xmm3 + vpxorq %xmm3,%xmm25,%xmm25 + vmovdqa64 POLY2(%rip),%xmm0 + + + vpclmulqdq $0x01,%xmm25,%xmm0,%xmm3 + vpslldq $8,%xmm3,%xmm3 + vpxorq %xmm3,%xmm25,%xmm3 + + + vpclmulqdq $0x00,%xmm3,%xmm0,%xmm4 + vpsrldq $4,%xmm4,%xmm4 + vpclmulqdq $0x10,%xmm3,%xmm0,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm24,%xmm4,%xmm14 + + + + + + + + + + + + + vpxorq %xmm7,%xmm14,%xmm14 + + jmp .L_after_reduction_21 +.L_small_initial_compute_done_21: +.L_after_reduction_21: + jmp .L_last_blocks_done_19 +.L_last_num_blocks_is_2_19: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $254,%r15d + jae .L_16_blocks_overflow_22 + vpaddd %ymm28,%ymm2,%ymm0 + jmp .L_16_blocks_ok_22 + +.L_16_blocks_overflow_22: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %ymm29,%ymm0,%ymm0 +.L_16_blocks_ok_22: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq 
$0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%ymm17{%k1}{z} + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %ymm31,%ymm0,%ymm0 + vaesenclast %ymm30,%ymm0,%ymm0 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %ymm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %ymm29,%ymm0,%ymm17 + vextracti32x4 $1,%zmm17,%xmm7 + subq $16 * (2 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_23 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_23 +.L_small_initial_partial_block_23: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq 
$0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_23: + + orq %r8,%r8 + je .L_after_reduction_23 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_23: + jmp .L_last_blocks_done_19 +.L_last_num_blocks_is_3_19: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $253,%r15d + jae .L_16_blocks_overflow_24 + vpaddd %zmm28,%zmm2,%zmm0 + jmp .L_16_blocks_ok_24 + +.L_16_blocks_overflow_24: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %zmm29,%zmm0,%zmm0 +.L_16_blocks_ok_24: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm0,%zmm0 + vpxorq %zmm17,%zmm0,%zmm0 + vextracti32x4 $2,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vextracti32x4 $2,%zmm17,%xmm7 + subq $16 * (3 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_25 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + 
vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_25 +.L_small_initial_partial_block_25: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_25: + + orq %r8,%r8 + je .L_after_reduction_25 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_25: + jmp .L_last_blocks_done_19 +.L_last_num_blocks_is_4_19: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $252,%r15d + jae .L_16_blocks_overflow_26 + vpaddd %zmm28,%zmm2,%zmm0 + jmp .L_16_blocks_ok_26 + +.L_16_blocks_overflow_26: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %zmm29,%zmm0,%zmm0 +.L_16_blocks_ok_26: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq 
$0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm0,%zmm0 + vpxorq %zmm17,%zmm0,%zmm0 + vextracti32x4 $3,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vextracti32x4 $3,%zmm17,%xmm7 + subq $16 * (4 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_27 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_27 +.L_small_initial_partial_block_27: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_27: + + orq %r8,%r8 + je .L_after_reduction_27 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_27: + jmp .L_last_blocks_done_19 +.L_last_num_blocks_is_5_19: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $251,%r15d + jae .L_16_blocks_overflow_28 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %xmm27,%xmm0,%xmm3 + jmp .L_16_blocks_ok_28 + +.L_16_blocks_overflow_28: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb 
%zmm29,%zmm0,%zmm0 + vpshufb %xmm29,%xmm3,%xmm3 +.L_16_blocks_ok_28: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%xmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %xmm30,%xmm3,%xmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %xmm19,%xmm3,%xmm3 + vextracti32x4 $0,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %xmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %xmm29,%xmm3,%xmm19 + vextracti32x4 $0,%zmm19,%xmm7 + subq $16 * (5 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_29 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 
$1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_29 +.L_small_initial_partial_block_29: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_29: + + orq %r8,%r8 + je .L_after_reduction_29 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_29: + jmp .L_last_blocks_done_19 +.L_last_num_blocks_is_6_19: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $250,%r15d + jae .L_16_blocks_overflow_30 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %ymm27,%ymm0,%ymm3 + jmp .L_16_blocks_ok_30 + +.L_16_blocks_overflow_30: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %ymm29,%ymm3,%ymm3 +.L_16_blocks_ok_30: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + 
vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%ymm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %ymm30,%ymm3,%ymm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %ymm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %ymm29,%ymm3,%ymm19 + vextracti32x4 $1,%zmm19,%xmm7 + subq $16 * (6 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_31 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_31 +.L_small_initial_partial_block_31: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + 
vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_31: + + orq %r8,%r8 + je .L_after_reduction_31 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_31: + jmp .L_last_blocks_done_19 +.L_last_num_blocks_is_7_19: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $249,%r15d + jae .L_16_blocks_overflow_32 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + jmp .L_16_blocks_ok_32 + +.L_16_blocks_overflow_32: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 +.L_16_blocks_ok_32: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vpxorq %zmm17,%zmm0,%zmm0 + 
vpxorq %zmm19,%zmm3,%zmm3 + vextracti32x4 $2,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vextracti32x4 $2,%zmm19,%xmm7 + subq $16 * (7 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_33 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_33 +.L_small_initial_partial_block_33: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_33: + + orq %r8,%r8 + je .L_after_reduction_33 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_33: + jmp .L_last_blocks_done_19 +.L_last_num_blocks_is_8_19: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $248,%r15d + jae .L_16_blocks_overflow_34 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + jmp .L_16_blocks_ok_34 + +.L_16_blocks_overflow_34: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb 
%zmm29,%zmm3,%zmm3 +.L_16_blocks_ok_34: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti32x4 $3,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vextracti32x4 $3,%zmm19,%xmm7 + subq $16 * (8 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_35 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq 
%zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_35 +.L_small_initial_partial_block_35: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_35: + + orq %r8,%r8 + je .L_after_reduction_35 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_35: + jmp .L_last_blocks_done_19 +.L_last_num_blocks_is_9_19: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $247,%r15d + jae .L_16_blocks_overflow_36 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %xmm27,%xmm3,%xmm4 + jmp .L_16_blocks_ok_36 + +.L_16_blocks_overflow_36: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %xmm29,%xmm4,%xmm4 +.L_16_blocks_ok_36: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + 
vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%xmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %xmm30,%xmm4,%xmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %xmm20,%xmm4,%xmm4 + vextracti32x4 $0,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %xmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %xmm29,%xmm4,%xmm20 + vextracti32x4 $0,%zmm20,%xmm7 + subq $16 * (9 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_37 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + 
vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_37 +.L_small_initial_partial_block_37: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_37: + + orq %r8,%r8 + je .L_after_reduction_37 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_37: + jmp .L_last_blocks_done_19 +.L_last_num_blocks_is_10_19: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $246,%r15d + jae .L_16_blocks_overflow_38 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %ymm27,%ymm3,%ymm4 + jmp .L_16_blocks_ok_38 + +.L_16_blocks_overflow_38: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %ymm29,%ymm4,%ymm4 +.L_16_blocks_ok_38: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq 
$0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%ymm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %ymm30,%ymm4,%ymm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %ymm20,%ymm4,%ymm4 + vextracti32x4 $1,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %ymm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %ymm29,%ymm4,%ymm20 + vextracti32x4 $1,%zmm20,%xmm7 + subq $16 * (10 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_39 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq 
$4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_39 +.L_small_initial_partial_block_39: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_39: + + orq %r8,%r8 + je .L_after_reduction_39 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_39: + jmp .L_last_blocks_done_19 +.L_last_num_blocks_is_11_19: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $245,%r15d + jae .L_16_blocks_overflow_40 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + jmp .L_16_blocks_ok_40 + +.L_16_blocks_overflow_40: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 +.L_16_blocks_ok_40: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq 
$0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vextracti32x4 $2,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vextracti32x4 $2,%zmm20,%xmm7 + subq $16 * (11 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_41 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq 
$0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_41 +.L_small_initial_partial_block_41: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_41: + + orq %r8,%r8 + je .L_after_reduction_41 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_41: + jmp .L_last_blocks_done_19 +.L_last_num_blocks_is_12_19: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $244,%r15d + jae .L_16_blocks_overflow_42 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + jmp .L_16_blocks_ok_42 + +.L_16_blocks_overflow_42: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 +.L_16_blocks_ok_42: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq 
$0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vextracti32x4 $3,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vextracti32x4 $3,%zmm20,%xmm7 + subq $16 * (12 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_43 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 160(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq 
$0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_43 +.L_small_initial_partial_block_43: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_43: + + orq %r8,%r8 + je .L_after_reduction_43 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_43: + jmp .L_last_blocks_done_19 +.L_last_num_blocks_is_13_19: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $243,%r15d + jae .L_16_blocks_overflow_44 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %xmm27,%xmm4,%xmm5 + jmp .L_16_blocks_ok_44 + +.L_16_blocks_overflow_44: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %xmm29,%xmm5,%xmm5 +.L_16_blocks_ok_44: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 
192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%xmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %xmm30,%xmm5,%xmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %xmm21,%xmm5,%xmm5 + vextracti32x4 $0,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %xmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vpshufb %xmm29,%xmm5,%xmm21 + vextracti32x4 $0,%zmm21,%xmm7 + subq $16 * (13 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_45 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 144(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4 + vpclmulqdq 
$0x10,%xmm1,%xmm21,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_45 +.L_small_initial_partial_block_45: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 160(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_45: + + orq %r8,%r8 + je .L_after_reduction_45 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_45: + jmp .L_last_blocks_done_19 +.L_last_num_blocks_is_14_19: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $242,%r15d + jae .L_16_blocks_overflow_46 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %ymm27,%ymm4,%ymm5 + jmp .L_16_blocks_ok_46 + +.L_16_blocks_overflow_46: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %ymm29,%ymm5,%ymm5 +.L_16_blocks_ok_46: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 
16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%ymm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %ymm30,%ymm5,%ymm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %ymm21,%ymm5,%ymm5 + vextracti32x4 $1,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %ymm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vpshufb %ymm29,%ymm5,%ymm21 + vextracti32x4 $1,%zmm21,%xmm7 + subq $16 * (14 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_47 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 128(%rsi),%zmm1 + 
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_47 +.L_small_initial_partial_block_47: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 144(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_47: + + orq 
%r8,%r8 + je .L_after_reduction_47 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_47: + jmp .L_last_blocks_done_19 +.L_last_num_blocks_is_15_19: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $241,%r15d + jae .L_16_blocks_overflow_48 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_48 + +.L_16_blocks_overflow_48: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_48: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + 
vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + vextracti32x4 $2,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vpshufb %zmm29,%zmm5,%zmm21 + vextracti32x4 $2,%zmm21,%xmm7 + subq $16 * (15 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_49 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 112(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_49 +.L_small_initial_partial_block_49: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 128(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + 
vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_49: + + orq %r8,%r8 + je .L_after_reduction_49 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_49: + jmp .L_last_blocks_done_19 +.L_last_num_blocks_is_16_19: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $240,%r15d + jae .L_16_blocks_overflow_50 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_50 + +.L_16_blocks_overflow_50: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_50: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + 
vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + vextracti32x4 $3,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vpshufb %zmm29,%zmm5,%zmm21 + vextracti32x4 $3,%zmm21,%xmm7 + subq $16 * (16 - 1),%r8 +.L_small_initial_partial_block_51: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 112(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq 
$0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_51: + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_51: + jmp .L_last_blocks_done_19 +.L_last_num_blocks_is_0_19: + vmovdqa64 1024(%rsp),%zmm13 + vmovdqu64 0(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 1088(%rsp),%zmm13 + vmovdqu64 64(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + vpternlogq $0x96,%zmm10,%zmm4,%zmm26 + vpternlogq $0x96,%zmm6,%zmm0,%zmm24 + vpternlogq $0x96,%zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + vmovdqa64 1152(%rsp),%zmm13 + vmovdqu64 128(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 1216(%rsp),%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + + vpternlogq $0x96,%zmm10,%zmm4,%zmm26 + vpternlogq $0x96,%zmm6,%zmm0,%zmm24 + vpternlogq $0x96,%zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + + vpsrldq $8,%zmm26,%zmm0 + vpslldq $8,%zmm26,%zmm3 + vpxorq %zmm0,%zmm24,%zmm24 + vpxorq %zmm3,%zmm25,%zmm25 + vextracti64x4 $1,%zmm24,%ymm0 + vpxorq %ymm0,%ymm24,%ymm24 + vextracti32x4 $1,%ymm24,%xmm0 + vpxorq %xmm0,%xmm24,%xmm24 + vextracti64x4 $1,%zmm25,%ymm3 + vpxorq %ymm3,%ymm25,%ymm25 + vextracti32x4 $1,%ymm25,%xmm3 + vpxorq %xmm3,%xmm25,%xmm25 + vmovdqa64 POLY2(%rip),%xmm4 + + + vpclmulqdq $0x01,%xmm25,%xmm4,%xmm0 + vpslldq $8,%xmm0,%xmm0 + vpxorq %xmm0,%xmm25,%xmm0 + + + vpclmulqdq $0x00,%xmm0,%xmm4,%xmm3 + vpsrldq $4,%xmm3,%xmm3 + vpclmulqdq $0x10,%xmm0,%xmm4,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm24,%xmm3,%xmm14 + +.L_last_blocks_done_19: + vpshufb %xmm29,%xmm2,%xmm2 + jmp .L_ghash_done_10 +.L_encrypt_32_blocks_10: + cmpb $240,%r15b + jae .L_16_blocks_overflow_52 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_52 +.L_16_blocks_overflow_52: + vpshufb %zmm29,%zmm2,%zmm2 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_52: + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp),%zmm1 + + + + + vshufi64x2 $255,%zmm5,%zmm5,%zmm2 + addb $16,%r15b + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + + + + + + + + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 
48(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm6 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + + + + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%zmm21 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm15,%zmm10,%zmm26 + vpxorq %zmm12,%zmm6,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + + + + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + + + + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1) + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 + vmovdqa64 %zmm0,1280(%rsp) + vmovdqa64 %zmm3,1344(%rsp) + vmovdqa64 %zmm4,1408(%rsp) + vmovdqa64 %zmm5,1472(%rsp) + cmpb $240,%r15b + jae .L_16_blocks_overflow_53 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_53 +.L_16_blocks_overflow_53: + vpshufb %zmm29,%zmm2,%zmm2 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_53: + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 256(%rsp),%zmm1 + + + + + vshufi64x2 $255,%zmm5,%zmm5,%zmm2 + addb $16,%r15b + + + vbroadcastf64x2 
16(%rdi),%zmm31 + vmovdqu64 320(%rsp),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + + + + + + + + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 384(%rsp),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 448(%rsp),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm6 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + + + + vmovdqu8 256(%rcx,%r11,1),%zmm17 + vmovdqu8 320(%rcx,%r11,1),%zmm19 + vmovdqu8 384(%rcx,%r11,1),%zmm20 + vmovdqu8 448(%rcx,%r11,1),%zmm21 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vpternlogq $0x96,%zmm12,%zmm6,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + + + + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + + + + movq %r9,%r10 + vmovdqu8 %zmm0,256(%r10,%r11,1) + vmovdqu8 %zmm3,320(%r10,%r11,1) + vmovdqu8 %zmm4,384(%r10,%r11,1) + vmovdqu8 %zmm5,448(%r10,%r11,1) + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 + vmovdqa64 %zmm0,768(%rsp) + vmovdqa64 %zmm3,832(%rsp) + vmovdqa64 %zmm4,896(%rsp) + vmovdqa64 %zmm5,960(%rsp) + vmovdqa64 1280(%rsp),%zmm13 + vmovdqu64 512(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + 
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 1344(%rsp),%zmm13 + vmovdqu64 576(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + vpternlogq $0x96,%zmm10,%zmm4,%zmm26 + vpternlogq $0x96,%zmm6,%zmm0,%zmm24 + vpternlogq $0x96,%zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + vmovdqa64 1408(%rsp),%zmm13 + vmovdqu64 640(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 1472(%rsp),%zmm13 + vmovdqu64 704(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + + vpternlogq $0x96,%zmm10,%zmm4,%zmm26 + vpternlogq $0x96,%zmm6,%zmm0,%zmm24 + vpternlogq $0x96,%zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + + vpsrldq $8,%zmm26,%zmm0 + vpslldq $8,%zmm26,%zmm3 + vpxorq %zmm0,%zmm24,%zmm24 + vpxorq %zmm3,%zmm25,%zmm25 + vextracti64x4 $1,%zmm24,%ymm0 + vpxorq %ymm0,%ymm24,%ymm24 + vextracti32x4 $1,%ymm24,%xmm0 + vpxorq %xmm0,%xmm24,%xmm24 + vextracti64x4 $1,%zmm25,%ymm3 + vpxorq %ymm3,%ymm25,%ymm25 + vextracti32x4 $1,%ymm25,%xmm3 + vpxorq %xmm3,%xmm25,%xmm25 + vmovdqa64 POLY2(%rip),%xmm4 + + + vpclmulqdq $0x01,%xmm25,%xmm4,%xmm0 + vpslldq $8,%xmm0,%xmm0 + vpxorq %xmm0,%xmm25,%xmm0 + + + vpclmulqdq $0x00,%xmm0,%xmm4,%xmm3 + vpsrldq $4,%xmm3,%xmm3 + vpclmulqdq $0x10,%xmm0,%xmm4,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm24,%xmm3,%xmm14 + + subq $512,%r8 + addq $512,%r11 + movl %r8d,%r10d + andl $~15,%r10d + movl $512,%ebx + subl %r10d,%ebx + movl %r8d,%r10d + addl $15,%r10d + shrl $4,%r10d + je .L_last_num_blocks_is_0_54 + + cmpl $8,%r10d + je .L_last_num_blocks_is_8_54 + jb .L_last_num_blocks_is_7_1_54 + + + cmpl $12,%r10d + je .L_last_num_blocks_is_12_54 + jb .L_last_num_blocks_is_11_9_54 + + + cmpl $15,%r10d + je .L_last_num_blocks_is_15_54 + ja .L_last_num_blocks_is_16_54 + cmpl $14,%r10d + je .L_last_num_blocks_is_14_54 + jmp .L_last_num_blocks_is_13_54 + +.L_last_num_blocks_is_11_9_54: + + cmpl $10,%r10d + je .L_last_num_blocks_is_10_54 + ja .L_last_num_blocks_is_11_54 + jmp .L_last_num_blocks_is_9_54 + +.L_last_num_blocks_is_7_1_54: + cmpl $4,%r10d + je .L_last_num_blocks_is_4_54 + jb .L_last_num_blocks_is_3_1_54 + + cmpl $6,%r10d + ja .L_last_num_blocks_is_7_54 + je .L_last_num_blocks_is_6_54 + jmp .L_last_num_blocks_is_5_54 + +.L_last_num_blocks_is_3_1_54: + + cmpl $2,%r10d + ja .L_last_num_blocks_is_3_54 + je .L_last_num_blocks_is_2_54 +.L_last_num_blocks_is_1_54: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $255,%r15d + jae .L_16_blocks_overflow_55 + vpaddd %xmm28,%xmm2,%xmm0 + jmp .L_16_blocks_ok_55 + +.L_16_blocks_overflow_55: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %xmm29,%xmm0,%xmm0 +.L_16_blocks_ok_55: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq 
$0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%xmm17{%k1}{z} + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %xmm31,%xmm0,%xmm0 + vaesenclast %xmm30,%xmm0,%xmm0 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti32x4 $0,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %xmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %xmm29,%xmm0,%xmm17 + vextracti32x4 $0,%zmm17,%xmm7 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_56 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_56 +.L_small_initial_partial_block_56: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + + + vpsrldq $8,%zmm26,%zmm0 + vpslldq $8,%zmm26,%zmm3 + vpxorq %zmm0,%zmm24,%zmm24 + vpxorq %zmm3,%zmm25,%zmm25 + vextracti64x4 $1,%zmm24,%ymm0 + vpxorq %ymm0,%ymm24,%ymm24 + vextracti32x4 $1,%ymm24,%xmm0 + vpxorq %xmm0,%xmm24,%xmm24 + vextracti64x4 $1,%zmm25,%ymm3 + vpxorq %ymm3,%ymm25,%ymm25 + vextracti32x4 $1,%ymm25,%xmm3 + vpxorq %xmm3,%xmm25,%xmm25 + vmovdqa64 POLY2(%rip),%xmm0 + + + vpclmulqdq $0x01,%xmm25,%xmm0,%xmm3 + vpslldq $8,%xmm3,%xmm3 + vpxorq %xmm3,%xmm25,%xmm3 + + + vpclmulqdq $0x00,%xmm3,%xmm0,%xmm4 + vpsrldq $4,%xmm4,%xmm4 + vpclmulqdq $0x10,%xmm3,%xmm0,%xmm14 + vpslldq 
$4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm24,%xmm4,%xmm14 + + + + + + + + + + + + + vpxorq %xmm7,%xmm14,%xmm14 + + jmp .L_after_reduction_56 +.L_small_initial_compute_done_56: +.L_after_reduction_56: + jmp .L_last_blocks_done_54 +.L_last_num_blocks_is_2_54: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $254,%r15d + jae .L_16_blocks_overflow_57 + vpaddd %ymm28,%ymm2,%ymm0 + jmp .L_16_blocks_ok_57 + +.L_16_blocks_overflow_57: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %ymm29,%ymm0,%ymm0 +.L_16_blocks_ok_57: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%ymm17{%k1}{z} + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %ymm31,%ymm0,%ymm0 + vaesenclast %ymm30,%ymm0,%ymm0 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %ymm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %ymm29,%ymm0,%ymm17 + vextracti32x4 $1,%zmm17,%xmm7 + subq $16 * (2 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_58 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq 
%xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_58 +.L_small_initial_partial_block_58: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_58: + + orq %r8,%r8 + je .L_after_reduction_58 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_58: + jmp .L_last_blocks_done_54 +.L_last_num_blocks_is_3_54: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $253,%r15d + jae .L_16_blocks_overflow_59 + vpaddd %zmm28,%zmm2,%zmm0 + jmp .L_16_blocks_ok_59 + +.L_16_blocks_overflow_59: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %zmm29,%zmm0,%zmm0 +.L_16_blocks_ok_59: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc 
%zmm30,%zmm0,%zmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm0,%zmm0 + vpxorq %zmm17,%zmm0,%zmm0 + vextracti32x4 $2,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vextracti32x4 $2,%zmm17,%xmm7 + subq $16 * (3 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_60 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_60 +.L_small_initial_partial_block_60: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_60: + + orq %r8,%r8 + je .L_after_reduction_60 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_60: + jmp .L_last_blocks_done_54 +.L_last_num_blocks_is_4_54: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $252,%r15d + jae .L_16_blocks_overflow_61 + vpaddd %zmm28,%zmm2,%zmm0 + jmp .L_16_blocks_ok_61 + +.L_16_blocks_overflow_61: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %zmm29,%zmm0,%zmm0 +.L_16_blocks_ok_61: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + 
vpxorq %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm0,%zmm0 + vpxorq %zmm17,%zmm0,%zmm0 + vextracti32x4 $3,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vextracti32x4 $3,%zmm17,%xmm7 + subq $16 * (4 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_62 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_62 +.L_small_initial_partial_block_62: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq 
%zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_62: + + orq %r8,%r8 + je .L_after_reduction_62 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_62: + jmp .L_last_blocks_done_54 +.L_last_num_blocks_is_5_54: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $251,%r15d + jae .L_16_blocks_overflow_63 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %xmm27,%xmm0,%xmm3 + jmp .L_16_blocks_ok_63 + +.L_16_blocks_overflow_63: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %xmm29,%xmm3,%xmm3 +.L_16_blocks_ok_63: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%xmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vaesenclast 
%zmm30,%zmm0,%zmm0 + vaesenclast %xmm30,%xmm3,%xmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %xmm19,%xmm3,%xmm3 + vextracti32x4 $0,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %xmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %xmm29,%xmm3,%xmm19 + vextracti32x4 $0,%zmm19,%xmm7 + subq $16 * (5 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_64 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_64 +.L_small_initial_partial_block_64: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_64: + + orq %r8,%r8 + je .L_after_reduction_64 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_64: + jmp .L_last_blocks_done_54 +.L_last_num_blocks_is_6_54: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $250,%r15d + jae .L_16_blocks_overflow_65 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %ymm27,%ymm0,%ymm3 + jmp .L_16_blocks_ok_65 + +.L_16_blocks_overflow_65: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %ymm29,%ymm3,%ymm3 +.L_16_blocks_ok_65: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm3,%xmm2 + vshufi64x2 
$0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%ymm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %ymm30,%ymm3,%ymm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %ymm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %ymm29,%ymm3,%ymm19 + vextracti32x4 $1,%zmm19,%xmm7 + subq $16 * (6 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_66 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + 
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_66 +.L_small_initial_partial_block_66: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_66: + + orq %r8,%r8 + je .L_after_reduction_66 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_66: + jmp .L_last_blocks_done_54 +.L_last_num_blocks_is_7_54: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $249,%r15d + jae .L_16_blocks_overflow_67 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + jmp .L_16_blocks_ok_67 + +.L_16_blocks_overflow_67: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 +.L_16_blocks_ok_67: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq 
$0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti32x4 $2,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vextracti32x4 $2,%zmm19,%xmm7 + subq $16 * (7 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_68 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_68 +.L_small_initial_partial_block_68: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + 
vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_68: + + orq %r8,%r8 + je .L_after_reduction_68 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_68: + jmp .L_last_blocks_done_54 +.L_last_num_blocks_is_8_54: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $248,%r15d + jae .L_16_blocks_overflow_69 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + jmp .L_16_blocks_ok_69 + +.L_16_blocks_overflow_69: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 +.L_16_blocks_ok_69: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + 
vextracti32x4 $3,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vextracti32x4 $3,%zmm19,%xmm7 + subq $16 * (8 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_70 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_70 +.L_small_initial_partial_block_70: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_70: + + orq %r8,%r8 + je .L_after_reduction_70 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_70: + jmp .L_last_blocks_done_54 +.L_last_num_blocks_is_9_54: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $247,%r15d + jae .L_16_blocks_overflow_71 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %xmm27,%xmm3,%xmm4 + jmp .L_16_blocks_ok_71 + +.L_16_blocks_overflow_71: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd 
%zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %xmm29,%xmm4,%xmm4 +.L_16_blocks_ok_71: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%xmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %xmm30,%xmm4,%xmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %xmm20,%xmm4,%xmm4 + vextracti32x4 $0,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %xmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %xmm29,%xmm4,%xmm20 + vextracti32x4 $0,%zmm20,%xmm7 + subq $16 * (9 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_72 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq 
$0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_72 +.L_small_initial_partial_block_72: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_72: + + orq %r8,%r8 + je .L_after_reduction_72 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_72: + jmp .L_last_blocks_done_54 +.L_last_num_blocks_is_10_54: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $246,%r15d + jae .L_16_blocks_overflow_73 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %ymm27,%ymm3,%ymm4 + jmp .L_16_blocks_ok_73 + +.L_16_blocks_overflow_73: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %ymm29,%ymm4,%ymm4 +.L_16_blocks_ok_73: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + 
vextracti32x4 $1,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%ymm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %ymm30,%ymm4,%ymm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %ymm20,%ymm4,%ymm4 + vextracti32x4 $1,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %ymm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %ymm29,%ymm4,%ymm20 + vextracti32x4 $1,%zmm20,%xmm7 + subq $16 * (10 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_74 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq 
%zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_74 +.L_small_initial_partial_block_74: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_74: + + orq %r8,%r8 + je .L_after_reduction_74 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_74: + jmp .L_last_blocks_done_54 +.L_last_num_blocks_is_11_54: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $245,%r15d + jae .L_16_blocks_overflow_75 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + jmp .L_16_blocks_ok_75 + +.L_16_blocks_overflow_75: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 +.L_16_blocks_ok_75: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm4,%xmm2 + vshufi64x2 
$0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vextracti32x4 $2,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vextracti32x4 $2,%zmm20,%xmm7 + subq $16 * (11 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_76 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq 
%zmm31,%zmm5,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_76 +.L_small_initial_partial_block_76: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_76: + + orq %r8,%r8 + je .L_after_reduction_76 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_76: + jmp .L_last_blocks_done_54 +.L_last_num_blocks_is_12_54: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $244,%r15d + jae .L_16_blocks_overflow_77 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + jmp .L_16_blocks_ok_77 + +.L_16_blocks_overflow_77: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 +.L_16_blocks_ok_77: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm4,%xmm2 + 
vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vextracti32x4 $3,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vextracti32x4 $3,%zmm20,%xmm7 + subq $16 * (12 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_78 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 160(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq 
$0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_78 +.L_small_initial_partial_block_78: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_78: + + orq %r8,%r8 + je .L_after_reduction_78 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_78: + jmp .L_last_blocks_done_54 +.L_last_num_blocks_is_13_54: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $243,%r15d + jae .L_16_blocks_overflow_79 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %xmm27,%xmm4,%xmm5 + jmp .L_16_blocks_ok_79 + +.L_16_blocks_overflow_79: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %xmm29,%xmm5,%xmm5 +.L_16_blocks_ok_79: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 
768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%xmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %xmm30,%xmm5,%xmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %xmm21,%xmm5,%xmm5 + vextracti32x4 $0,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %xmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vpshufb %xmm29,%xmm5,%xmm21 + vextracti32x4 $0,%zmm21,%xmm7 + subq $16 * (13 - 1),%r8 + + + cmpq $16,%r8 + 
jl .L_small_initial_partial_block_80 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 144(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_80 +.L_small_initial_partial_block_80: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 160(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_80: + + orq %r8,%r8 + je .L_after_reduction_80 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_80: + jmp .L_last_blocks_done_54 
+.L_last_num_blocks_is_14_54: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $242,%r15d + jae .L_16_blocks_overflow_81 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %ymm27,%ymm4,%ymm5 + jmp .L_16_blocks_ok_81 + +.L_16_blocks_overflow_81: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %ymm29,%ymm5,%ymm5 +.L_16_blocks_ok_81: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%ymm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc 
%zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %ymm30,%ymm5,%ymm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %ymm21,%ymm5,%ymm5 + vextracti32x4 $1,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %ymm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vpshufb %ymm29,%ymm5,%ymm21 + vextracti32x4 $1,%zmm21,%xmm7 + subq $16 * (14 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_82 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 128(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_82 +.L_small_initial_partial_block_82: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 144(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3 + + vpxorq 
%zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_82: + + orq %r8,%r8 + je .L_after_reduction_82 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_82: + jmp .L_last_blocks_done_54 +.L_last_num_blocks_is_15_54: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $241,%r15d + jae .L_16_blocks_overflow_83 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_83 + +.L_16_blocks_overflow_83: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_83: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 
192(%rcx,%r11,1),%zmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + vextracti32x4 $2,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vpshufb %zmm29,%zmm5,%zmm21 + vextracti32x4 $2,%zmm21,%xmm7 + subq $16 * (15 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_84 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 112(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp 
.L_small_initial_compute_done_84 +.L_small_initial_partial_block_84: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 128(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_84: + + orq %r8,%r8 + je .L_after_reduction_84 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_84: + jmp .L_last_blocks_done_54 +.L_last_num_blocks_is_16_54: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $240,%r15d + jae .L_16_blocks_overflow_85 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_85 + +.L_16_blocks_overflow_85: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_85: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq 
$0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + vextracti32x4 $3,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vpshufb %zmm29,%zmm5,%zmm21 + vextracti32x4 $3,%zmm21,%xmm7 + subq $16 * (16 - 1),%r8 +.L_small_initial_partial_block_86: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 112(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 
$2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_86: + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_86: + jmp .L_last_blocks_done_54 +.L_last_num_blocks_is_0_54: + vmovdqa64 768(%rsp),%zmm13 + vpxorq %zmm14,%zmm13,%zmm13 + vmovdqu64 0(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 832(%rsp),%zmm13 + vmovdqu64 64(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + vpxorq %zmm10,%zmm4,%zmm26 + vpxorq %zmm6,%zmm0,%zmm24 + vpxorq %zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + vmovdqa64 896(%rsp),%zmm13 + vmovdqu64 128(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 960(%rsp),%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + + vpternlogq $0x96,%zmm10,%zmm4,%zmm26 + vpternlogq $0x96,%zmm6,%zmm0,%zmm24 + vpternlogq $0x96,%zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + + vpsrldq $8,%zmm26,%zmm0 + vpslldq $8,%zmm26,%zmm3 + vpxorq %zmm0,%zmm24,%zmm24 + vpxorq %zmm3,%zmm25,%zmm25 + vextracti64x4 $1,%zmm24,%ymm0 + vpxorq %ymm0,%ymm24,%ymm24 + vextracti32x4 $1,%ymm24,%xmm0 + vpxorq %xmm0,%xmm24,%xmm24 + vextracti64x4 $1,%zmm25,%ymm3 + vpxorq %ymm3,%ymm25,%ymm25 + vextracti32x4 $1,%ymm25,%xmm3 + vpxorq %xmm3,%xmm25,%xmm25 + vmovdqa64 POLY2(%rip),%xmm4 + + + vpclmulqdq $0x01,%xmm25,%xmm4,%xmm0 + vpslldq $8,%xmm0,%xmm0 + vpxorq %xmm0,%xmm25,%xmm0 + + + vpclmulqdq $0x00,%xmm0,%xmm4,%xmm3 + vpsrldq $4,%xmm3,%xmm3 + vpclmulqdq $0x10,%xmm0,%xmm4,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm24,%xmm3,%xmm14 + +.L_last_blocks_done_54: + vpshufb %xmm29,%xmm2,%xmm2 + jmp .L_ghash_done_10 +.L_encrypt_16_blocks_10: + cmpb $240,%r15b + jae .L_16_blocks_overflow_87 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_87 +.L_16_blocks_overflow_87: + vpshufb %zmm29,%zmm2,%zmm2 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + 
vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_87: + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp),%zmm1 + + + + + vshufi64x2 $255,%zmm5,%zmm5,%zmm2 + addb $16,%r15b + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + + + + + + + + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm6 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + + + + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%zmm21 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm15,%zmm10,%zmm26 + vpxorq %zmm12,%zmm6,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + + + + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + + + + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1) + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 + 
vmovdqa64 %zmm0,1280(%rsp) + vmovdqa64 %zmm3,1344(%rsp) + vmovdqa64 %zmm4,1408(%rsp) + vmovdqa64 %zmm5,1472(%rsp) + vmovdqa64 1024(%rsp),%zmm13 + vmovdqu64 256(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 1088(%rsp),%zmm13 + vmovdqu64 320(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + vpternlogq $0x96,%zmm10,%zmm4,%zmm26 + vpternlogq $0x96,%zmm6,%zmm0,%zmm24 + vpternlogq $0x96,%zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + vmovdqa64 1152(%rsp),%zmm13 + vmovdqu64 384(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 1216(%rsp),%zmm13 + vmovdqu64 448(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + + vpternlogq $0x96,%zmm10,%zmm4,%zmm26 + vpternlogq $0x96,%zmm6,%zmm0,%zmm24 + vpternlogq $0x96,%zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + subq $256,%r8 + addq $256,%r11 + movl %r8d,%r10d + addl $15,%r10d + shrl $4,%r10d + je .L_last_num_blocks_is_0_88 + + cmpl $8,%r10d + je .L_last_num_blocks_is_8_88 + jb .L_last_num_blocks_is_7_1_88 + + + cmpl $12,%r10d + je .L_last_num_blocks_is_12_88 + jb .L_last_num_blocks_is_11_9_88 + + + cmpl $15,%r10d + je .L_last_num_blocks_is_15_88 + ja .L_last_num_blocks_is_16_88 + cmpl $14,%r10d + je .L_last_num_blocks_is_14_88 + jmp .L_last_num_blocks_is_13_88 + +.L_last_num_blocks_is_11_9_88: + + cmpl $10,%r10d + je .L_last_num_blocks_is_10_88 + ja .L_last_num_blocks_is_11_88 + jmp .L_last_num_blocks_is_9_88 + +.L_last_num_blocks_is_7_1_88: + cmpl $4,%r10d + je .L_last_num_blocks_is_4_88 + jb .L_last_num_blocks_is_3_1_88 + + cmpl $6,%r10d + ja .L_last_num_blocks_is_7_88 + je .L_last_num_blocks_is_6_88 + jmp .L_last_num_blocks_is_5_88 + +.L_last_num_blocks_is_3_1_88: + + cmpl $2,%r10d + ja .L_last_num_blocks_is_3_88 + je .L_last_num_blocks_is_2_88 +.L_last_num_blocks_is_1_88: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $255,%r15d + jae .L_16_blocks_overflow_89 + vpaddd %xmm28,%xmm2,%xmm0 + jmp .L_16_blocks_ok_89 + +.L_16_blocks_overflow_89: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %xmm29,%xmm0,%xmm0 +.L_16_blocks_ok_89: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $0,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq 
$0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%xmm17{%k1}{z} + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %xmm31,%xmm0,%xmm0 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %xmm30,%xmm0,%xmm0 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti32x4 $0,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %xmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %xmm29,%xmm0,%xmm17 + vextracti32x4 $0,%zmm17,%xmm7 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_90 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_90 +.L_small_initial_partial_block_90: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + + + + + + + + + + + + vpxorq %xmm7,%xmm14,%xmm14 + + jmp .L_after_reduction_90 +.L_small_initial_compute_done_90: +.L_after_reduction_90: + jmp .L_last_blocks_done_88 +.L_last_num_blocks_is_2_88: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $254,%r15d + jae .L_16_blocks_overflow_91 + vpaddd %ymm28,%ymm2,%ymm0 + jmp .L_16_blocks_ok_91 + +.L_16_blocks_overflow_91: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb 
%ymm29,%ymm0,%ymm0 +.L_16_blocks_ok_91: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $1,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%ymm17{%k1}{z} + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %ymm31,%ymm0,%ymm0 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %ymm30,%ymm0,%ymm0 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %ymm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %ymm29,%ymm0,%ymm17 + vextracti32x4 $1,%zmm17,%xmm7 + subq $16 * (2 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_92 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq 
%ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_92 +.L_small_initial_partial_block_92: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_92: + + orq %r8,%r8 + je .L_after_reduction_92 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_92: + jmp .L_last_blocks_done_88 +.L_last_num_blocks_is_3_88: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $253,%r15d + jae .L_16_blocks_overflow_93 + vpaddd %zmm28,%zmm2,%zmm0 + jmp .L_16_blocks_ok_93 + +.L_16_blocks_overflow_93: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %zmm29,%zmm0,%zmm0 +.L_16_blocks_ok_93: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $2,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + 
vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vpxorq %zmm17,%zmm0,%zmm0 + vextracti32x4 $2,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vextracti32x4 $2,%zmm17,%xmm7 + subq $16 * (3 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_94 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_94 +.L_small_initial_partial_block_94: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_94: + + orq %r8,%r8 + je .L_after_reduction_94 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_94: + jmp .L_last_blocks_done_88 +.L_last_num_blocks_is_4_88: + leaq byte64_len_to_mask_table(%rip),%r10 + movq 
%r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $252,%r15d + jae .L_16_blocks_overflow_95 + vpaddd %zmm28,%zmm2,%zmm0 + jmp .L_16_blocks_ok_95 + +.L_16_blocks_overflow_95: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %zmm29,%zmm0,%zmm0 +.L_16_blocks_ok_95: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $3,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vpxorq %zmm17,%zmm0,%zmm0 + vextracti32x4 $3,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vextracti32x4 $3,%zmm17,%xmm7 + subq $16 * (4 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_96 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq 
$8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_96 +.L_small_initial_partial_block_96: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_96: + + orq %r8,%r8 + je .L_after_reduction_96 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_96: + jmp .L_last_blocks_done_88 +.L_last_num_blocks_is_5_88: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $251,%r15d + jae .L_16_blocks_overflow_97 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %xmm27,%xmm0,%xmm3 + jmp .L_16_blocks_ok_97 + +.L_16_blocks_overflow_97: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %xmm29,%xmm3,%xmm3 +.L_16_blocks_ok_97: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $0,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 
80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%xmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %xmm30,%xmm3,%xmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %xmm19,%xmm3,%xmm3 + vextracti32x4 $0,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %xmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %xmm29,%xmm3,%xmm19 + vextracti32x4 $0,%zmm19,%xmm7 + subq $16 * (5 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_98 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_98 +.L_small_initial_partial_block_98: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 
%xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_98: + + orq %r8,%r8 + je .L_after_reduction_98 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_98: + jmp .L_last_blocks_done_88 +.L_last_num_blocks_is_6_88: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $250,%r15d + jae .L_16_blocks_overflow_99 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %ymm27,%ymm0,%ymm3 + jmp .L_16_blocks_ok_99 + +.L_16_blocks_overflow_99: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %ymm29,%ymm3,%ymm3 +.L_16_blocks_ok_99: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $1,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%ymm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq 
$0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %ymm30,%ymm3,%ymm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %ymm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %ymm29,%ymm3,%ymm19 + vextracti32x4 $1,%zmm19,%xmm7 + subq $16 * (6 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_100 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_100 +.L_small_initial_partial_block_100: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + 
vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_100: + + orq %r8,%r8 + je .L_after_reduction_100 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_100: + jmp .L_last_blocks_done_88 +.L_last_num_blocks_is_7_88: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $249,%r15d + jae .L_16_blocks_overflow_101 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + jmp .L_16_blocks_ok_101 + +.L_16_blocks_overflow_101: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 +.L_16_blocks_ok_101: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $2,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vextracti64x4 
$1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti32x4 $2,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vextracti32x4 $2,%zmm19,%xmm7 + subq $16 * (7 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_102 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_102 +.L_small_initial_partial_block_102: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + 
+.L_small_initial_compute_done_102: + + orq %r8,%r8 + je .L_after_reduction_102 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_102: + jmp .L_last_blocks_done_88 +.L_last_num_blocks_is_8_88: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $248,%r15d + jae .L_16_blocks_overflow_103 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + jmp .L_16_blocks_ok_103 + +.L_16_blocks_overflow_103: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 +.L_16_blocks_ok_103: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $3,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq 
$0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti32x4 $3,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vextracti32x4 $3,%zmm19,%xmm7 + subq $16 * (8 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_104 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_104 +.L_small_initial_partial_block_104: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_104: + + orq %r8,%r8 + je .L_after_reduction_104 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_104: + jmp .L_last_blocks_done_88 +.L_last_num_blocks_is_9_88: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $247,%r15d + jae .L_16_blocks_overflow_105 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %xmm27,%xmm3,%xmm4 + jmp 
.L_16_blocks_ok_105 + +.L_16_blocks_overflow_105: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %xmm29,%xmm4,%xmm4 +.L_16_blocks_ok_105: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $0,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%xmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq 
$4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %xmm30,%xmm4,%xmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %xmm20,%xmm4,%xmm4 + vextracti32x4 $0,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %xmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %xmm29,%xmm4,%xmm20 + vextracti32x4 $0,%zmm20,%xmm7 + subq $16 * (9 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_106 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_106 +.L_small_initial_partial_block_106: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_106: + + orq %r8,%r8 + je .L_after_reduction_106 + vpxorq 
%xmm7,%xmm14,%xmm14 +.L_after_reduction_106: + jmp .L_last_blocks_done_88 +.L_last_num_blocks_is_10_88: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $246,%r15d + jae .L_16_blocks_overflow_107 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %ymm27,%ymm3,%ymm4 + jmp .L_16_blocks_ok_107 + +.L_16_blocks_overflow_107: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %ymm29,%ymm4,%ymm4 +.L_16_blocks_ok_107: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $1,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%ymm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq 
%xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %ymm30,%ymm4,%ymm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %ymm20,%ymm4,%ymm4 + vextracti32x4 $1,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %ymm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %ymm29,%ymm4,%ymm20 + vextracti32x4 $1,%zmm20,%xmm7 + subq $16 * (10 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_108 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_108 +.L_small_initial_partial_block_108: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq 
%zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_108: + + orq %r8,%r8 + je .L_after_reduction_108 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_108: + jmp .L_last_blocks_done_88 +.L_last_num_blocks_is_11_88: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $245,%r15d + jae .L_16_blocks_overflow_109 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + jmp .L_16_blocks_ok_109 + +.L_16_blocks_overflow_109: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 +.L_16_blocks_ok_109: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $2,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq 
$0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vextracti32x4 $2,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vextracti32x4 $2,%zmm20,%xmm7 + subq $16 * (11 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_110 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_110 +.L_small_initial_partial_block_110: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 
256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_110: + + orq %r8,%r8 + je .L_after_reduction_110 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_110: + jmp .L_last_blocks_done_88 +.L_last_num_blocks_is_12_88: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $244,%r15d + jae .L_16_blocks_overflow_111 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + jmp .L_16_blocks_ok_111 + +.L_16_blocks_overflow_111: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 +.L_16_blocks_ok_111: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $3,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc 
%zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vextracti32x4 $3,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vextracti32x4 $3,%zmm20,%xmm7 + subq $16 * (12 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_112 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 160(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + 
vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_112 +.L_small_initial_partial_block_112: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_112: + + orq %r8,%r8 + je .L_after_reduction_112 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_112: + jmp .L_last_blocks_done_88 +.L_last_num_blocks_is_13_88: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $243,%r15d + jae .L_16_blocks_overflow_113 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %xmm27,%xmm4,%xmm5 + jmp .L_16_blocks_ok_113 + +.L_16_blocks_overflow_113: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %xmm29,%xmm5,%xmm5 +.L_16_blocks_ok_113: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $0,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq 
$0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%xmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %xmm30,%xmm5,%xmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %xmm21,%xmm5,%xmm5 + vextracti32x4 $0,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %xmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vpshufb %xmm29,%xmm5,%xmm21 + vextracti32x4 $0,%zmm21,%xmm7 + subq $16 * (13 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_114 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 
+ vmovdqu64 144(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_114 +.L_small_initial_partial_block_114: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 160(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_114: + + orq %r8,%r8 + je .L_after_reduction_114 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_114: + jmp .L_last_blocks_done_88 +.L_last_num_blocks_is_14_88: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $242,%r15d + jae .L_16_blocks_overflow_115 
+ vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %ymm27,%ymm4,%ymm5 + jmp .L_16_blocks_ok_115 + +.L_16_blocks_overflow_115: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %ymm29,%ymm5,%ymm5 +.L_16_blocks_ok_115: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $1,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%ymm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc 
%zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %ymm30,%ymm5,%ymm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %ymm21,%ymm5,%ymm5 + vextracti32x4 $1,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %ymm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vpshufb %ymm29,%ymm5,%ymm21 + vextracti32x4 $1,%zmm21,%xmm7 + subq $16 * (14 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_116 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 128(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_116 +.L_small_initial_partial_block_116: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 144(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 
272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_116: + + orq %r8,%r8 + je .L_after_reduction_116 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_116: + jmp .L_last_blocks_done_88 +.L_last_num_blocks_is_15_88: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $241,%r15d + jae .L_16_blocks_overflow_117 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_117 + +.L_16_blocks_overflow_117: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_117: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $2,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + 
vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + vextracti32x4 $2,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vpshufb %zmm29,%zmm5,%zmm21 + vextracti32x4 $2,%zmm21,%xmm7 + subq $16 * (15 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_118 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 112(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + 
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_118 +.L_small_initial_partial_block_118: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 128(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_118: + + orq %r8,%r8 + je .L_after_reduction_118 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_118: + jmp .L_last_blocks_done_88 +.L_last_num_blocks_is_16_88: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $240,%r15d + jae .L_16_blocks_overflow_119 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_119 + +.L_16_blocks_overflow_119: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 
ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_119: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $3,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 
+ vpxorq %xmm13,%xmm7,%xmm7 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + vextracti32x4 $3,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vpshufb %zmm29,%zmm5,%zmm21 + vextracti32x4 $3,%zmm21,%xmm7 + subq $16 * (16 - 1),%r8 +.L_small_initial_partial_block_120: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 112(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_120: + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_120: + jmp .L_last_blocks_done_88 +.L_last_num_blocks_is_0_88: + vmovdqa64 1280(%rsp),%zmm13 + vmovdqu64 512(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 1344(%rsp),%zmm13 + vmovdqu64 576(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + vpternlogq $0x96,%zmm10,%zmm4,%zmm26 + vpternlogq $0x96,%zmm6,%zmm0,%zmm24 + vpternlogq $0x96,%zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + vmovdqa64 1408(%rsp),%zmm13 + vmovdqu64 
640(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 1472(%rsp),%zmm13 + vmovdqu64 704(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + + vpternlogq $0x96,%zmm10,%zmm4,%zmm26 + vpternlogq $0x96,%zmm6,%zmm0,%zmm24 + vpternlogq $0x96,%zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + + vpsrldq $8,%zmm26,%zmm0 + vpslldq $8,%zmm26,%zmm3 + vpxorq %zmm0,%zmm24,%zmm24 + vpxorq %zmm3,%zmm25,%zmm25 + vextracti64x4 $1,%zmm24,%ymm0 + vpxorq %ymm0,%ymm24,%ymm24 + vextracti32x4 $1,%ymm24,%xmm0 + vpxorq %xmm0,%xmm24,%xmm24 + vextracti64x4 $1,%zmm25,%ymm3 + vpxorq %ymm3,%ymm25,%ymm25 + vextracti32x4 $1,%ymm25,%xmm3 + vpxorq %xmm3,%xmm25,%xmm25 + vmovdqa64 POLY2(%rip),%xmm4 + + + vpclmulqdq $0x01,%xmm25,%xmm4,%xmm0 + vpslldq $8,%xmm0,%xmm0 + vpxorq %xmm0,%xmm25,%xmm0 + + + vpclmulqdq $0x00,%xmm0,%xmm4,%xmm3 + vpsrldq $4,%xmm3,%xmm3 + vpclmulqdq $0x10,%xmm0,%xmm4,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm24,%xmm3,%xmm14 + +.L_last_blocks_done_88: + vpshufb %xmm29,%xmm2,%xmm2 + jmp .L_ghash_done_10 + +.L_message_below_32_blocks_10: + + + subq $256,%r8 + addq $256,%r11 + movl %r8d,%r10d + testq %r14,%r14 + jnz .L_skip_hkeys_precomputation_121 + vmovdqu64 640(%rsp),%zmm3 + + + vshufi64x2 $0x00,%zmm3,%zmm3,%zmm3 + + vmovdqu64 576(%rsp),%zmm4 + vmovdqu64 512(%rsp),%zmm5 + + vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4 + vpxorq %zmm10,%zmm4,%zmm4 + + vpsrldq $8,%zmm4,%zmm10 + vpslldq $8,%zmm4,%zmm4 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4 + vpslldq $4,%zmm4,%zmm4 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm4 + + vmovdqu64 %zmm4,448(%rsp) + + vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5 + vpxorq %zmm10,%zmm5,%zmm5 + + vpsrldq $8,%zmm5,%zmm10 + vpslldq $8,%zmm5,%zmm5 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5 + vpslldq $4,%zmm5,%zmm5 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm5 + + vmovdqu64 %zmm5,384(%rsp) + + vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4 + vpxorq %zmm10,%zmm4,%zmm4 + + vpsrldq $8,%zmm4,%zmm10 + vpslldq $8,%zmm4,%zmm4 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4 + vpslldq $4,%zmm4,%zmm4 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm4 + + vmovdqu64 %zmm4,320(%rsp) + + vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5 + 
vpxorq %zmm10,%zmm5,%zmm5 + + vpsrldq $8,%zmm5,%zmm10 + vpslldq $8,%zmm5,%zmm5 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5 + vpslldq $4,%zmm5,%zmm5 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm5 + + vmovdqu64 %zmm5,256(%rsp) +.L_skip_hkeys_precomputation_121: + movq $1,%r14 + andl $~15,%r10d + movl $512,%ebx + subl %r10d,%ebx + movl %r8d,%r10d + addl $15,%r10d + shrl $4,%r10d + je .L_last_num_blocks_is_0_122 + + cmpl $8,%r10d + je .L_last_num_blocks_is_8_122 + jb .L_last_num_blocks_is_7_1_122 + + + cmpl $12,%r10d + je .L_last_num_blocks_is_12_122 + jb .L_last_num_blocks_is_11_9_122 + + + cmpl $15,%r10d + je .L_last_num_blocks_is_15_122 + ja .L_last_num_blocks_is_16_122 + cmpl $14,%r10d + je .L_last_num_blocks_is_14_122 + jmp .L_last_num_blocks_is_13_122 + +.L_last_num_blocks_is_11_9_122: + + cmpl $10,%r10d + je .L_last_num_blocks_is_10_122 + ja .L_last_num_blocks_is_11_122 + jmp .L_last_num_blocks_is_9_122 + +.L_last_num_blocks_is_7_1_122: + cmpl $4,%r10d + je .L_last_num_blocks_is_4_122 + jb .L_last_num_blocks_is_3_1_122 + + cmpl $6,%r10d + ja .L_last_num_blocks_is_7_122 + je .L_last_num_blocks_is_6_122 + jmp .L_last_num_blocks_is_5_122 + +.L_last_num_blocks_is_3_1_122: + + cmpl $2,%r10d + ja .L_last_num_blocks_is_3_122 + je .L_last_num_blocks_is_2_122 +.L_last_num_blocks_is_1_122: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $255,%r15d + jae .L_16_blocks_overflow_123 + vpaddd %xmm28,%xmm2,%xmm0 + jmp .L_16_blocks_ok_123 + +.L_16_blocks_overflow_123: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %xmm29,%xmm0,%xmm0 +.L_16_blocks_ok_123: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%xmm17{%k1}{z} + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 
128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %xmm31,%xmm0,%xmm0 + vaesenclast %xmm30,%xmm0,%xmm0 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti32x4 $0,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %xmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %xmm29,%xmm0,%xmm17 + vextracti32x4 $0,%zmm17,%xmm7 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_124 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_124 +.L_small_initial_partial_block_124: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + + + vpsrldq $8,%zmm26,%zmm0 + vpslldq $8,%zmm26,%zmm3 + vpxorq %zmm0,%zmm24,%zmm24 + vpxorq %zmm3,%zmm25,%zmm25 + vextracti64x4 $1,%zmm24,%ymm0 + vpxorq %ymm0,%ymm24,%ymm24 + vextracti32x4 $1,%ymm24,%xmm0 + vpxorq %xmm0,%xmm24,%xmm24 + vextracti64x4 $1,%zmm25,%ymm3 + vpxorq %ymm3,%ymm25,%ymm25 + vextracti32x4 $1,%ymm25,%xmm3 + vpxorq %xmm3,%xmm25,%xmm25 + vmovdqa64 POLY2(%rip),%xmm0 + + + vpclmulqdq $0x01,%xmm25,%xmm0,%xmm3 + vpslldq $8,%xmm3,%xmm3 + vpxorq %xmm3,%xmm25,%xmm3 + + + vpclmulqdq $0x00,%xmm3,%xmm0,%xmm4 + vpsrldq $4,%xmm4,%xmm4 + vpclmulqdq $0x10,%xmm3,%xmm0,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm24,%xmm4,%xmm14 + + + + + + + + + + + + + vpxorq %xmm7,%xmm14,%xmm14 + + jmp .L_after_reduction_124 +.L_small_initial_compute_done_124: +.L_after_reduction_124: + jmp .L_last_blocks_done_122 +.L_last_num_blocks_is_2_122: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $254,%r15d + jae .L_16_blocks_overflow_125 + vpaddd %ymm28,%ymm2,%ymm0 + jmp .L_16_blocks_ok_125 + +.L_16_blocks_overflow_125: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %ymm29,%ymm0,%ymm0 +.L_16_blocks_ok_125: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq 
$0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%ymm17{%k1}{z} + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %ymm31,%ymm0,%ymm0 + vaesenclast %ymm30,%ymm0,%ymm0 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %ymm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %ymm29,%ymm0,%ymm17 + vextracti32x4 $1,%zmm17,%xmm7 + subq $16 * (2 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_126 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_126 +.L_small_initial_partial_block_126: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + 
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_126: + + orq %r8,%r8 + je .L_after_reduction_126 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_126: + jmp .L_last_blocks_done_122 +.L_last_num_blocks_is_3_122: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $253,%r15d + jae .L_16_blocks_overflow_127 + vpaddd %zmm28,%zmm2,%zmm0 + jmp .L_16_blocks_ok_127 + +.L_16_blocks_overflow_127: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %zmm29,%zmm0,%zmm0 +.L_16_blocks_ok_127: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm0,%zmm0 + vpxorq %zmm17,%zmm0,%zmm0 + vextracti32x4 $2,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vextracti32x4 $2,%zmm17,%xmm7 + subq $16 * (3 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_128 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 
$1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_128 +.L_small_initial_partial_block_128: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_128: + + orq %r8,%r8 + je .L_after_reduction_128 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_128: + jmp .L_last_blocks_done_122 +.L_last_num_blocks_is_4_122: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $252,%r15d + jae .L_16_blocks_overflow_129 + vpaddd %zmm28,%zmm2,%zmm0 + jmp .L_16_blocks_ok_129 + +.L_16_blocks_overflow_129: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %zmm29,%zmm0,%zmm0 +.L_16_blocks_ok_129: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq 
$0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm0,%zmm0 + vpxorq %zmm17,%zmm0,%zmm0 + vextracti32x4 $3,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vextracti32x4 $3,%zmm17,%xmm7 + subq $16 * (4 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_130 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_130 +.L_small_initial_partial_block_130: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_130: + + orq %r8,%r8 + je .L_after_reduction_130 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_130: + jmp .L_last_blocks_done_122 +.L_last_num_blocks_is_5_122: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $251,%r15d + jae .L_16_blocks_overflow_131 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %xmm27,%xmm0,%xmm3 + jmp .L_16_blocks_ok_131 + +.L_16_blocks_overflow_131: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb 
%xmm29,%xmm3,%xmm3 +.L_16_blocks_ok_131: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%xmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %xmm30,%xmm3,%xmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %xmm19,%xmm3,%xmm3 + vextracti32x4 $0,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %xmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %xmm29,%xmm3,%xmm19 + vextracti32x4 $0,%zmm19,%xmm7 + subq $16 * (5 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_132 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 
$1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_132 +.L_small_initial_partial_block_132: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_132: + + orq %r8,%r8 + je .L_after_reduction_132 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_132: + jmp .L_last_blocks_done_122 +.L_last_num_blocks_is_6_122: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $250,%r15d + jae .L_16_blocks_overflow_133 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %ymm27,%ymm0,%ymm3 + jmp .L_16_blocks_ok_133 + +.L_16_blocks_overflow_133: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %ymm29,%ymm3,%ymm3 +.L_16_blocks_ok_133: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq 
$0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%ymm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %ymm30,%ymm3,%ymm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %ymm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %ymm29,%ymm3,%ymm19 + vextracti32x4 $1,%zmm19,%xmm7 + subq $16 * (6 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_134 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_134 +.L_small_initial_partial_block_134: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 
$1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_134: + + orq %r8,%r8 + je .L_after_reduction_134 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_134: + jmp .L_last_blocks_done_122 +.L_last_num_blocks_is_7_122: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $249,%r15d + jae .L_16_blocks_overflow_135 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + jmp .L_16_blocks_ok_135 + +.L_16_blocks_overflow_135: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 +.L_16_blocks_ok_135: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti32x4 $2,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 
%zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vextracti32x4 $2,%zmm19,%xmm7 + subq $16 * (7 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_136 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_136 +.L_small_initial_partial_block_136: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_136: + + orq %r8,%r8 + je .L_after_reduction_136 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_136: + jmp .L_last_blocks_done_122 +.L_last_num_blocks_is_8_122: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $248,%r15d + jae .L_16_blocks_overflow_137 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + jmp .L_16_blocks_ok_137 + +.L_16_blocks_overflow_137: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 +.L_16_blocks_ok_137: + + + + + vbroadcastf64x2 
0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti32x4 $3,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vextracti32x4 $3,%zmm19,%xmm7 + subq $16 * (8 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_138 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 
$1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_138 +.L_small_initial_partial_block_138: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_138: + + orq %r8,%r8 + je .L_after_reduction_138 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_138: + jmp .L_last_blocks_done_122 +.L_last_num_blocks_is_9_122: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $247,%r15d + jae .L_16_blocks_overflow_139 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %xmm27,%xmm3,%xmm4 + jmp .L_16_blocks_ok_139 + +.L_16_blocks_overflow_139: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %xmm29,%xmm4,%xmm4 +.L_16_blocks_ok_139: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc 
%zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%xmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %xmm30,%xmm4,%xmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %xmm20,%xmm4,%xmm4 + vextracti32x4 $0,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %xmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %xmm29,%xmm4,%xmm20 + vextracti32x4 $0,%zmm20,%xmm7 + subq $16 * (9 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_140 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq 
%xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_140 +.L_small_initial_partial_block_140: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_140: + + orq %r8,%r8 + je .L_after_reduction_140 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_140: + jmp .L_last_blocks_done_122 +.L_last_num_blocks_is_10_122: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $246,%r15d + jae .L_16_blocks_overflow_141 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %ymm27,%ymm3,%ymm4 + jmp .L_16_blocks_ok_141 + +.L_16_blocks_overflow_141: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %ymm29,%ymm4,%ymm4 +.L_16_blocks_ok_141: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq 
$0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%ymm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %ymm30,%ymm4,%ymm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %ymm20,%ymm4,%ymm4 + vextracti32x4 $1,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %ymm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %ymm29,%ymm4,%ymm20 + vextracti32x4 $1,%zmm20,%xmm7 + subq $16 * (10 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_142 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_142 
+.L_small_initial_partial_block_142: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_142: + + orq %r8,%r8 + je .L_after_reduction_142 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_142: + jmp .L_last_blocks_done_122 +.L_last_num_blocks_is_11_122: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $245,%r15d + jae .L_16_blocks_overflow_143 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + jmp .L_16_blocks_ok_143 + +.L_16_blocks_overflow_143: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 +.L_16_blocks_ok_143: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq 
$0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vextracti32x4 $2,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vextracti32x4 $2,%zmm20,%xmm7 + subq $16 * (11 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_144 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp 
.L_small_initial_compute_done_144 +.L_small_initial_partial_block_144: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_144: + + orq %r8,%r8 + je .L_after_reduction_144 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_144: + jmp .L_last_blocks_done_122 +.L_last_num_blocks_is_12_122: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $244,%r15d + jae .L_16_blocks_overflow_145 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + jmp .L_16_blocks_ok_145 + +.L_16_blocks_overflow_145: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 +.L_16_blocks_ok_145: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq 
$0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vextracti32x4 $3,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vextracti32x4 $3,%zmm20,%xmm7 + subq $16 * (12 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_146 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 160(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp 
.L_small_initial_compute_done_146 +.L_small_initial_partial_block_146: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_146: + + orq %r8,%r8 + je .L_after_reduction_146 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_146: + jmp .L_last_blocks_done_122 +.L_last_num_blocks_is_13_122: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $243,%r15d + jae .L_16_blocks_overflow_147 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %xmm27,%xmm4,%xmm5 + jmp .L_16_blocks_ok_147 + +.L_16_blocks_overflow_147: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %xmm29,%xmm5,%xmm5 +.L_16_blocks_ok_147: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc 
%zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%xmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %xmm30,%xmm5,%xmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %xmm21,%xmm5,%xmm5 + vextracti32x4 $0,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %xmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vpshufb %xmm29,%xmm5,%xmm21 + vextracti32x4 $0,%zmm21,%xmm7 + subq $16 * (13 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_148 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 144(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq 
$0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_148 +.L_small_initial_partial_block_148: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 160(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_148: + + orq %r8,%r8 + je .L_after_reduction_148 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_148: + jmp .L_last_blocks_done_122 +.L_last_num_blocks_is_14_122: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $242,%r15d + jae .L_16_blocks_overflow_149 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %ymm27,%ymm4,%ymm5 + jmp .L_16_blocks_ok_149 + +.L_16_blocks_overflow_149: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %ymm29,%ymm5,%ymm5 +.L_16_blocks_ok_149: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + 
vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%ymm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %ymm30,%ymm5,%ymm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %ymm21,%ymm5,%ymm5 + vextracti32x4 $1,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %ymm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vpshufb %ymm29,%ymm5,%ymm21 + vextracti32x4 $1,%zmm21,%xmm7 + subq $16 * (14 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_150 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 128(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 
192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_150 +.L_small_initial_partial_block_150: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 144(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_150: + + orq %r8,%r8 + je .L_after_reduction_150 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_150: + jmp .L_last_blocks_done_122 +.L_last_num_blocks_is_15_122: + leaq 
byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $241,%r15d + jae .L_16_blocks_overflow_151 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_151 + +.L_16_blocks_overflow_151: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_151: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + 
vaesenc %zmm31,%zmm5,%zmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + vextracti32x4 $2,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vpshufb %zmm29,%zmm5,%zmm21 + vextracti32x4 $2,%zmm21,%xmm7 + subq $16 * (15 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_152 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 112(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_152 +.L_small_initial_partial_block_152: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 128(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3 + + vpxorq 
%zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_152: + + orq %r8,%r8 + je .L_after_reduction_152 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_152: + jmp .L_last_blocks_done_122 +.L_last_num_blocks_is_16_122: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $240,%r15d + jae .L_16_blocks_overflow_153 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_153 + +.L_16_blocks_overflow_153: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_153: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 
192(%rcx,%r11,1),%zmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + vextracti32x4 $3,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vpshufb %zmm29,%zmm5,%zmm21 + vextracti32x4 $3,%zmm21,%xmm7 + subq $16 * (16 - 1),%r8 +.L_small_initial_partial_block_154: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 112(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + 
+.L_small_initial_compute_done_154: + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_154: + jmp .L_last_blocks_done_122 +.L_last_num_blocks_is_0_122: + vmovdqa64 768(%rsp),%zmm13 + vpxorq %zmm14,%zmm13,%zmm13 + vmovdqu64 0(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 832(%rsp),%zmm13 + vmovdqu64 64(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + vpxorq %zmm10,%zmm4,%zmm26 + vpxorq %zmm6,%zmm0,%zmm24 + vpxorq %zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + vmovdqa64 896(%rsp),%zmm13 + vmovdqu64 128(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 960(%rsp),%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + + vpternlogq $0x96,%zmm10,%zmm4,%zmm26 + vpternlogq $0x96,%zmm6,%zmm0,%zmm24 + vpternlogq $0x96,%zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + + vpsrldq $8,%zmm26,%zmm0 + vpslldq $8,%zmm26,%zmm3 + vpxorq %zmm0,%zmm24,%zmm24 + vpxorq %zmm3,%zmm25,%zmm25 + vextracti64x4 $1,%zmm24,%ymm0 + vpxorq %ymm0,%ymm24,%ymm24 + vextracti32x4 $1,%ymm24,%xmm0 + vpxorq %xmm0,%xmm24,%xmm24 + vextracti64x4 $1,%zmm25,%ymm3 + vpxorq %ymm3,%ymm25,%ymm25 + vextracti32x4 $1,%ymm25,%xmm3 + vpxorq %xmm3,%xmm25,%xmm25 + vmovdqa64 POLY2(%rip),%xmm4 + + + vpclmulqdq $0x01,%xmm25,%xmm4,%xmm0 + vpslldq $8,%xmm0,%xmm0 + vpxorq %xmm0,%xmm25,%xmm0 + + + vpclmulqdq $0x00,%xmm0,%xmm4,%xmm3 + vpsrldq $4,%xmm3,%xmm3 + vpclmulqdq $0x10,%xmm0,%xmm4,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm24,%xmm3,%xmm14 + +.L_last_blocks_done_122: + vpshufb %xmm29,%xmm2,%xmm2 + jmp .L_ghash_done_10 + +.L_message_below_equal_16_blocks_10: + + + movl %r8d,%r12d + addl $15,%r12d + shrl $4,%r12d + cmpq $8,%r12 + je .L_small_initial_num_blocks_is_8_155 + jl .L_small_initial_num_blocks_is_7_1_155 + + + cmpq $12,%r12 + je .L_small_initial_num_blocks_is_12_155 + jl .L_small_initial_num_blocks_is_11_9_155 + + + cmpq $16,%r12 + je .L_small_initial_num_blocks_is_16_155 + cmpq $15,%r12 + je .L_small_initial_num_blocks_is_15_155 + cmpq $14,%r12 + je .L_small_initial_num_blocks_is_14_155 + jmp .L_small_initial_num_blocks_is_13_155 + +.L_small_initial_num_blocks_is_11_9_155: + + cmpq $11,%r12 + je .L_small_initial_num_blocks_is_11_155 + cmpq $10,%r12 + je .L_small_initial_num_blocks_is_10_155 + jmp .L_small_initial_num_blocks_is_9_155 + +.L_small_initial_num_blocks_is_7_1_155: + cmpq $4,%r12 + je .L_small_initial_num_blocks_is_4_155 + jl .L_small_initial_num_blocks_is_3_1_155 + + cmpq $7,%r12 + je .L_small_initial_num_blocks_is_7_155 + cmpq $6,%r12 + je .L_small_initial_num_blocks_is_6_155 + jmp .L_small_initial_num_blocks_is_5_155 + +.L_small_initial_num_blocks_is_3_1_155: + + cmpq $3,%r12 + je .L_small_initial_num_blocks_is_3_155 + cmpq $2,%r12 + je .L_small_initial_num_blocks_is_2_155 + + + + + +.L_small_initial_num_blocks_is_1_155: + vmovdqa64 SHUF_MASK(%rip),%xmm29 + vpaddd ONE(%rip),%xmm2,%xmm0 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $0,%zmm0,%xmm2 + vpshufb %xmm29,%xmm0,%xmm0 + vmovdqu8 
0(%rcx,%r11,1),%xmm6{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenclast %xmm15,%xmm0,%xmm0 + vpxorq %xmm6,%xmm0,%xmm0 + vextracti32x4 $0,%zmm0,%xmm12 + movq %r9,%r10 + vmovdqu8 %xmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %xmm29,%xmm0,%xmm6 + vextracti32x4 $0,%zmm6,%xmm13 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_156 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 336(%rsi),%xmm20 + vpclmulqdq $0x01,%xmm20,%xmm6,%xmm4 + vpclmulqdq $0x10,%xmm20,%xmm6,%xmm5 + vpclmulqdq $0x11,%xmm20,%xmm6,%xmm0 + vpclmulqdq $0x00,%xmm20,%xmm6,%xmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_156 +.L_small_initial_partial_block_156: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + + + + + + + + + + + + vpxorq %xmm13,%xmm14,%xmm14 + + jmp .L_after_reduction_156 +.L_small_initial_compute_done_156: +.L_after_reduction_156: + jmp .L_small_initial_blocks_encrypted_155 +.L_small_initial_num_blocks_is_2_155: + vmovdqa64 SHUF_MASK(%rip),%ymm29 + vshufi64x2 $0,%ymm2,%ymm2,%ymm0 + vpaddd ddq_add_1234(%rip),%ymm0,%ymm0 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $1,%zmm0,%xmm2 + vpshufb %ymm29,%ymm0,%ymm0 + vmovdqu8 0(%rcx,%r11,1),%ymm6{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenclast %ymm15,%ymm0,%ymm0 + vpxorq %ymm6,%ymm0,%ymm0 + vextracti32x4 $1,%zmm0,%xmm12 + movq %r9,%r10 + vmovdqu8 %ymm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %ymm29,%ymm0,%ymm6 + vextracti32x4 $1,%zmm6,%xmm13 + subq $16 * (2 - 1),%r8 + + + cmpq $16,%r8 + jl 
.L_small_initial_partial_block_157 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 320(%rsi),%ymm20 + vpclmulqdq $0x01,%ymm20,%ymm6,%ymm4 + vpclmulqdq $0x10,%ymm20,%ymm6,%ymm5 + vpclmulqdq $0x11,%ymm20,%ymm6,%ymm0 + vpclmulqdq $0x00,%ymm20,%ymm6,%ymm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_157 +.L_small_initial_partial_block_157: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 336(%rsi),%xmm20 + vpclmulqdq $0x01,%xmm20,%xmm6,%xmm4 + vpclmulqdq $0x10,%xmm20,%xmm6,%xmm5 + vpclmulqdq $0x11,%xmm20,%xmm6,%xmm0 + vpclmulqdq $0x00,%xmm20,%xmm6,%xmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_157: + + orq %r8,%r8 + je .L_after_reduction_157 + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_157: + jmp .L_small_initial_blocks_encrypted_155 +.L_small_initial_num_blocks_is_3_155: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $2,%zmm0,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vmovdqu8 0(%rcx,%r11,1),%zmm6{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vpxorq %zmm6,%zmm0,%zmm0 + vextracti32x4 $2,%zmm0,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm6 + vextracti32x4 $2,%zmm6,%xmm13 + subq $16 * (3 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_158 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + 
vmovdqu64 304(%rsi),%ymm20 + vinserti64x2 $2,336(%rsi),%zmm20,%zmm20 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_158 +.L_small_initial_partial_block_158: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 320(%rsi),%ymm20 + vpclmulqdq $0x01,%ymm20,%ymm6,%ymm4 + vpclmulqdq $0x10,%ymm20,%ymm6,%ymm5 + vpclmulqdq $0x11,%ymm20,%ymm6,%ymm0 + vpclmulqdq $0x00,%ymm20,%ymm6,%ymm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_158: + + orq %r8,%r8 + je .L_after_reduction_158 + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_158: + jmp .L_small_initial_blocks_encrypted_155 +.L_small_initial_num_blocks_is_4_155: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $3,%zmm0,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vmovdqu8 0(%rcx,%r11,1),%zmm6{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vpxorq %zmm6,%zmm0,%zmm0 + vextracti32x4 $3,%zmm0,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm6 + vextracti32x4 $3,%zmm6,%xmm13 + subq $16 * (4 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_159 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 288(%rsi),%zmm20 + vpclmulqdq 
$0x11,%zmm20,%zmm6,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19 + + vpxorq %zmm19,%zmm17,%zmm17 + vpsrldq $8,%zmm17,%zmm4 + vpslldq $8,%zmm17,%zmm5 + vpxorq %zmm4,%zmm15,%zmm0 + vpxorq %zmm5,%zmm16,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_159 +.L_small_initial_partial_block_159: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 304(%rsi),%ymm20 + vinserti64x2 $2,336(%rsi),%zmm20,%zmm20 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_159: + + orq %r8,%r8 + je .L_after_reduction_159 + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_159: + jmp .L_small_initial_blocks_encrypted_155 +.L_small_initial_num_blocks_is_5_155: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $64,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $0,%zmm3,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %xmm29,%xmm3,%xmm3 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%xmm7{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %xmm15,%xmm3,%xmm3 + vpxorq 
%zmm6,%zmm0,%zmm0 + vpxorq %xmm7,%xmm3,%xmm3 + vextracti32x4 $0,%zmm3,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %xmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm6 + vpshufb %xmm29,%xmm3,%xmm7 + vextracti32x4 $0,%zmm7,%xmm13 + subq $16 * (5 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_160 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 272(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19 + vmovdqu64 336(%rsi),%xmm20 + vpclmulqdq $0x01,%xmm20,%xmm7,%xmm4 + vpclmulqdq $0x10,%xmm20,%xmm7,%xmm5 + vpclmulqdq $0x11,%xmm20,%xmm7,%xmm0 + vpclmulqdq $0x00,%xmm20,%xmm7,%xmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_160 +.L_small_initial_partial_block_160: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 288(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19 + + vpxorq %zmm19,%zmm17,%zmm17 + vpsrldq $8,%zmm17,%zmm4 + vpslldq $8,%zmm17,%zmm5 + vpxorq %zmm4,%zmm15,%zmm0 + vpxorq %zmm5,%zmm16,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_160: + + orq %r8,%r8 + je .L_after_reduction_160 + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_160: + jmp .L_small_initial_blocks_encrypted_155 +.L_small_initial_num_blocks_is_6_155: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $64,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $1,%zmm3,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %ymm29,%ymm3,%ymm3 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%ymm7{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 
48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %ymm15,%ymm3,%ymm3 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %ymm7,%ymm3,%ymm3 + vextracti32x4 $1,%zmm3,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %ymm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm6 + vpshufb %ymm29,%ymm3,%ymm7 + vextracti32x4 $1,%zmm7,%xmm13 + subq $16 * (6 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_161 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 256(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19 + vmovdqu64 320(%rsi),%ymm20 + vpclmulqdq $0x01,%ymm20,%ymm7,%ymm4 + vpclmulqdq $0x10,%ymm20,%ymm7,%ymm5 + vpclmulqdq $0x11,%ymm20,%ymm7,%ymm0 + vpclmulqdq $0x00,%ymm20,%ymm7,%ymm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_161 +.L_small_initial_partial_block_161: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 272(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19 + vmovdqu64 336(%rsi),%xmm20 + vpclmulqdq $0x01,%xmm20,%xmm7,%xmm4 + vpclmulqdq $0x10,%xmm20,%xmm7,%xmm5 + vpclmulqdq $0x11,%xmm20,%xmm7,%xmm0 + vpclmulqdq $0x00,%xmm20,%xmm7,%xmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + 
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_161: + + orq %r8,%r8 + je .L_after_reduction_161 + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_161: + jmp .L_small_initial_blocks_encrypted_155 +.L_small_initial_num_blocks_is_7_155: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $64,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $2,%zmm3,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%zmm7{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %zmm15,%zmm3,%zmm3 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %zmm7,%zmm3,%zmm3 + vextracti32x4 $2,%zmm3,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm6 + vpshufb %zmm29,%zmm3,%zmm7 + vextracti32x4 $2,%zmm7,%xmm13 + subq $16 * (7 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_162 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 240(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19 + vmovdqu64 304(%rsi),%ymm20 + vinserti64x2 $2,336(%rsi),%zmm20,%zmm20 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm5 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_162 +.L_small_initial_partial_block_162: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq 
%zmm14,%zmm6,%zmm6 + vmovdqu64 256(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19 + vmovdqu64 320(%rsi),%ymm20 + vpclmulqdq $0x01,%ymm20,%ymm7,%ymm4 + vpclmulqdq $0x10,%ymm20,%ymm7,%ymm5 + vpclmulqdq $0x11,%ymm20,%ymm7,%ymm0 + vpclmulqdq $0x00,%ymm20,%ymm7,%ymm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_162: + + orq %r8,%r8 + je .L_after_reduction_162 + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_162: + jmp .L_small_initial_blocks_encrypted_155 +.L_small_initial_num_blocks_is_8_155: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $64,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $3,%zmm3,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%zmm7{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %zmm15,%zmm3,%zmm3 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %zmm7,%zmm3,%zmm3 + vextracti32x4 $3,%zmm3,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm6 + vpshufb %zmm29,%zmm3,%zmm7 + vextracti32x4 $3,%zmm7,%xmm13 + subq $16 * (8 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_163 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 224(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 288(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq 
$0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vpxorq %zmm15,%zmm0,%zmm15 + vpxorq %zmm16,%zmm3,%zmm16 + vpxorq %zmm17,%zmm4,%zmm17 + vpxorq %zmm19,%zmm5,%zmm19 + + vpxorq %zmm19,%zmm17,%zmm17 + vpsrldq $8,%zmm17,%zmm4 + vpslldq $8,%zmm17,%zmm5 + vpxorq %zmm4,%zmm15,%zmm0 + vpxorq %zmm5,%zmm16,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_163 +.L_small_initial_partial_block_163: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 240(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19 + vmovdqu64 304(%rsi),%ymm20 + vinserti64x2 $2,336(%rsi),%zmm20,%zmm20 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm5 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_163: + + orq %r8,%r8 + je .L_after_reduction_163 + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_163: + jmp .L_small_initial_blocks_encrypted_155 +.L_small_initial_num_blocks_is_9_155: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + vpaddd ddq_add_8888(%rip),%zmm0,%zmm4 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $128,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $0,%zmm4,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %xmm29,%xmm4,%xmm4 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%zmm7 + vmovdqu8 128(%rcx,%r11,1),%xmm10{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm15,%zmm3,%zmm3 + vpxorq %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc 
%zmm15,%zmm3,%zmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %zmm15,%zmm3,%zmm3 + vaesenclast %xmm15,%xmm4,%xmm4 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %zmm7,%zmm3,%zmm3 + vpxorq %xmm10,%xmm4,%xmm4 + vextracti32x4 $0,%zmm4,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %xmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm6 + vpshufb %zmm29,%zmm3,%zmm7 + vpshufb %xmm29,%xmm4,%xmm10 + vextracti32x4 $0,%zmm10,%xmm13 + subq $16 * (9 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_164 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 208(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 272(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vpxorq %zmm15,%zmm0,%zmm15 + vpxorq %zmm16,%zmm3,%zmm16 + vpxorq %zmm17,%zmm4,%zmm17 + vpxorq %zmm19,%zmm5,%zmm19 + vmovdqu64 336(%rsi),%xmm20 + vpclmulqdq $0x01,%xmm20,%xmm10,%xmm4 + vpclmulqdq $0x10,%xmm20,%xmm10,%xmm5 + vpclmulqdq $0x11,%xmm20,%xmm10,%xmm0 + vpclmulqdq $0x00,%xmm20,%xmm10,%xmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_164 +.L_small_initial_partial_block_164: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 224(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 288(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vpxorq %zmm15,%zmm0,%zmm15 + vpxorq %zmm16,%zmm3,%zmm16 + vpxorq %zmm17,%zmm4,%zmm17 + vpxorq %zmm19,%zmm5,%zmm19 + + vpxorq %zmm19,%zmm17,%zmm17 + vpsrldq $8,%zmm17,%zmm4 + vpslldq $8,%zmm17,%zmm5 + vpxorq %zmm4,%zmm15,%zmm0 + 
vpxorq %zmm5,%zmm16,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_164: + + orq %r8,%r8 + je .L_after_reduction_164 + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_164: + jmp .L_small_initial_blocks_encrypted_155 +.L_small_initial_num_blocks_is_10_155: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + vpaddd ddq_add_8888(%rip),%zmm0,%zmm4 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $128,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $1,%zmm4,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %ymm29,%ymm4,%ymm4 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%zmm7 + vmovdqu8 128(%rcx,%r11,1),%ymm10{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm15,%zmm3,%zmm3 + vpxorq %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %zmm15,%zmm3,%zmm3 + vaesenclast %ymm15,%ymm4,%ymm4 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %zmm7,%zmm3,%zmm3 + vpxorq %ymm10,%ymm4,%ymm4 + vextracti32x4 $1,%zmm4,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %ymm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm6 + vpshufb %zmm29,%zmm3,%zmm7 + vpshufb %ymm29,%ymm4,%ymm10 + vextracti32x4 $1,%zmm10,%xmm13 + subq $16 * (10 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_165 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 192(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 256(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + 
vpxorq %zmm15,%zmm0,%zmm15 + vpxorq %zmm16,%zmm3,%zmm16 + vpxorq %zmm17,%zmm4,%zmm17 + vpxorq %zmm19,%zmm5,%zmm19 + vmovdqu64 320(%rsi),%ymm20 + vpclmulqdq $0x01,%ymm20,%ymm10,%ymm4 + vpclmulqdq $0x10,%ymm20,%ymm10,%ymm5 + vpclmulqdq $0x11,%ymm20,%ymm10,%ymm0 + vpclmulqdq $0x00,%ymm20,%ymm10,%ymm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_165 +.L_small_initial_partial_block_165: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 208(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 272(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vpxorq %zmm15,%zmm0,%zmm15 + vpxorq %zmm16,%zmm3,%zmm16 + vpxorq %zmm17,%zmm4,%zmm17 + vpxorq %zmm19,%zmm5,%zmm19 + vmovdqu64 336(%rsi),%xmm20 + vpclmulqdq $0x01,%xmm20,%xmm10,%xmm4 + vpclmulqdq $0x10,%xmm20,%xmm10,%xmm5 + vpclmulqdq $0x11,%xmm20,%xmm10,%xmm0 + vpclmulqdq $0x00,%xmm20,%xmm10,%xmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_165: + + orq %r8,%r8 + je .L_after_reduction_165 + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_165: + jmp .L_small_initial_blocks_encrypted_155 +.L_small_initial_num_blocks_is_11_155: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + vpaddd ddq_add_8888(%rip),%zmm0,%zmm4 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $128,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $2,%zmm4,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%zmm7 + vmovdqu8 128(%rcx,%r11,1),%zmm10{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm15,%zmm3,%zmm3 + vpxorq 
%zmm15,%zmm4,%zmm4 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %zmm15,%zmm3,%zmm3 + vaesenclast %zmm15,%zmm4,%zmm4 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %zmm7,%zmm3,%zmm3 + vpxorq %zmm10,%zmm4,%zmm4 + vextracti32x4 $2,%zmm4,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm6 + vpshufb %zmm29,%zmm3,%zmm7 + vpshufb %zmm29,%zmm4,%zmm10 + vextracti32x4 $2,%zmm10,%xmm13 + subq $16 * (11 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_166 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 176(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 240(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vpxorq %zmm15,%zmm0,%zmm15 + vpxorq %zmm16,%zmm3,%zmm16 + vpxorq %zmm17,%zmm4,%zmm17 + vpxorq %zmm19,%zmm5,%zmm19 + vmovdqu64 304(%rsi),%ymm20 + vinserti64x2 $2,336(%rsi),%zmm20,%zmm20 + vpclmulqdq $0x01,%zmm20,%zmm10,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm10,%zmm5 + vpclmulqdq $0x11,%zmm20,%zmm10,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm10,%zmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_166 +.L_small_initial_partial_block_166: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 192(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq 
$0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 256(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vpxorq %zmm15,%zmm0,%zmm15 + vpxorq %zmm16,%zmm3,%zmm16 + vpxorq %zmm17,%zmm4,%zmm17 + vpxorq %zmm19,%zmm5,%zmm19 + vmovdqu64 320(%rsi),%ymm20 + vpclmulqdq $0x01,%ymm20,%ymm10,%ymm4 + vpclmulqdq $0x10,%ymm20,%ymm10,%ymm5 + vpclmulqdq $0x11,%ymm20,%ymm10,%ymm0 + vpclmulqdq $0x00,%ymm20,%ymm10,%ymm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_166: + + orq %r8,%r8 + je .L_after_reduction_166 + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_166: + jmp .L_small_initial_blocks_encrypted_155 +.L_small_initial_num_blocks_is_12_155: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + vpaddd ddq_add_8888(%rip),%zmm0,%zmm4 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $128,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $3,%zmm4,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%zmm7 + vmovdqu8 128(%rcx,%r11,1),%zmm10{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm15,%zmm3,%zmm3 + vpxorq %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %zmm15,%zmm3,%zmm3 + vaesenclast %zmm15,%zmm4,%zmm4 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %zmm7,%zmm3,%zmm3 + vpxorq %zmm10,%zmm4,%zmm4 + vextracti32x4 
$3,%zmm4,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm6 + vpshufb %zmm29,%zmm3,%zmm7 + vpshufb %zmm29,%zmm4,%zmm10 + vextracti32x4 $3,%zmm10,%xmm13 + subq $16 * (12 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_167 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 160(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 224(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vmovdqu64 288(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm0,%zmm6,%zmm15 + vpternlogq $0x96,%zmm3,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm4,%zmm6,%zmm17 + vpternlogq $0x96,%zmm5,%zmm7,%zmm19 + + vpxorq %zmm19,%zmm17,%zmm17 + vpsrldq $8,%zmm17,%zmm4 + vpslldq $8,%zmm17,%zmm5 + vpxorq %zmm4,%zmm15,%zmm0 + vpxorq %zmm5,%zmm16,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_167 +.L_small_initial_partial_block_167: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 176(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 240(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vpxorq %zmm15,%zmm0,%zmm15 + vpxorq %zmm16,%zmm3,%zmm16 + vpxorq %zmm17,%zmm4,%zmm17 + vpxorq %zmm19,%zmm5,%zmm19 + vmovdqu64 304(%rsi),%ymm20 + vinserti64x2 $2,336(%rsi),%zmm20,%zmm20 + vpclmulqdq $0x01,%zmm20,%zmm10,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm10,%zmm5 + vpclmulqdq $0x11,%zmm20,%zmm10,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm10,%zmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_167: + + orq %r8,%r8 + je 
.L_after_reduction_167 + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_167: + jmp .L_small_initial_blocks_encrypted_155 +.L_small_initial_num_blocks_is_13_155: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + vpaddd ddq_add_8888(%rip),%zmm0,%zmm4 + vpaddd ddq_add_8888(%rip),%zmm3,%zmm5 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $192,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $0,%zmm5,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %xmm29,%xmm5,%xmm5 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%zmm7 + vmovdqu8 128(%rcx,%r11,1),%zmm10 + vmovdqu8 192(%rcx,%r11,1),%xmm11{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm15,%zmm3,%zmm3 + vpxorq %zmm15,%zmm4,%zmm4 + vpxorq %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %zmm15,%zmm3,%zmm3 + vaesenclast %zmm15,%zmm4,%zmm4 + vaesenclast %xmm15,%xmm5,%xmm5 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %zmm7,%zmm3,%zmm3 + vpxorq %zmm10,%zmm4,%zmm4 + vpxorq %xmm11,%xmm5,%xmm5 + vextracti32x4 $0,%zmm5,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %xmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm6 + vpshufb %zmm29,%zmm3,%zmm7 + vpshufb %zmm29,%zmm4,%zmm10 + vpshufb %xmm29,%xmm5,%xmm11 + vextracti32x4 $0,%zmm11,%xmm13 + subq $16 * (13 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_168 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 144(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 208(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vmovdqu64 272(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7 
+ vpternlogq $0x96,%zmm0,%zmm6,%zmm15 + vpternlogq $0x96,%zmm3,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm4,%zmm6,%zmm17 + vpternlogq $0x96,%zmm5,%zmm7,%zmm19 + vmovdqu64 336(%rsi),%xmm20 + vpclmulqdq $0x01,%xmm20,%xmm11,%xmm4 + vpclmulqdq $0x10,%xmm20,%xmm11,%xmm5 + vpclmulqdq $0x11,%xmm20,%xmm11,%xmm0 + vpclmulqdq $0x00,%xmm20,%xmm11,%xmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_168 +.L_small_initial_partial_block_168: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 160(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 224(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vmovdqu64 288(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm0,%zmm6,%zmm15 + vpternlogq $0x96,%zmm3,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm4,%zmm6,%zmm17 + vpternlogq $0x96,%zmm5,%zmm7,%zmm19 + + vpxorq %zmm19,%zmm17,%zmm17 + vpsrldq $8,%zmm17,%zmm4 + vpslldq $8,%zmm17,%zmm5 + vpxorq %zmm4,%zmm15,%zmm0 + vpxorq %zmm5,%zmm16,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_168: + + orq %r8,%r8 + je .L_after_reduction_168 + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_168: + jmp .L_small_initial_blocks_encrypted_155 +.L_small_initial_num_blocks_is_14_155: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + vpaddd ddq_add_8888(%rip),%zmm0,%zmm4 + vpaddd ddq_add_8888(%rip),%zmm3,%zmm5 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $192,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $1,%zmm5,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %ymm29,%ymm5,%ymm5 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%zmm7 + vmovdqu8 128(%rcx,%r11,1),%zmm10 + 
vmovdqu8 192(%rcx,%r11,1),%ymm11{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm15,%zmm3,%zmm3 + vpxorq %zmm15,%zmm4,%zmm4 + vpxorq %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %zmm15,%zmm3,%zmm3 + vaesenclast %zmm15,%zmm4,%zmm4 + vaesenclast %ymm15,%ymm5,%ymm5 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %zmm7,%zmm3,%zmm3 + vpxorq %zmm10,%zmm4,%zmm4 + vpxorq %ymm11,%ymm5,%ymm5 + vextracti32x4 $1,%zmm5,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %ymm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm6 + vpshufb %zmm29,%zmm3,%zmm7 + vpshufb %zmm29,%zmm4,%zmm10 + vpshufb %ymm29,%ymm5,%ymm11 + vextracti32x4 $1,%zmm11,%xmm13 + subq $16 * (14 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_169 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 128(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 192(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vmovdqu64 256(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm0,%zmm6,%zmm15 + vpternlogq $0x96,%zmm3,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm4,%zmm6,%zmm17 + vpternlogq $0x96,%zmm5,%zmm7,%zmm19 + vmovdqu64 320(%rsi),%ymm20 + vpclmulqdq $0x01,%ymm20,%ymm11,%ymm4 + vpclmulqdq $0x10,%ymm20,%ymm11,%ymm5 + vpclmulqdq $0x11,%ymm20,%ymm11,%ymm0 + vpclmulqdq $0x00,%ymm20,%ymm11,%ymm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 
$1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_169 +.L_small_initial_partial_block_169: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 144(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 208(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vmovdqu64 272(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm0,%zmm6,%zmm15 + vpternlogq $0x96,%zmm3,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm4,%zmm6,%zmm17 + vpternlogq $0x96,%zmm5,%zmm7,%zmm19 + vmovdqu64 336(%rsi),%xmm20 + vpclmulqdq $0x01,%xmm20,%xmm11,%xmm4 + vpclmulqdq $0x10,%xmm20,%xmm11,%xmm5 + vpclmulqdq $0x11,%xmm20,%xmm11,%xmm0 + vpclmulqdq $0x00,%xmm20,%xmm11,%xmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_169: + + orq %r8,%r8 + je .L_after_reduction_169 + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_169: + jmp .L_small_initial_blocks_encrypted_155 +.L_small_initial_num_blocks_is_15_155: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + vpaddd ddq_add_8888(%rip),%zmm0,%zmm4 + vpaddd ddq_add_8888(%rip),%zmm3,%zmm5 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $192,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $2,%zmm5,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%zmm7 + vmovdqu8 128(%rcx,%r11,1),%zmm10 + vmovdqu8 192(%rcx,%r11,1),%zmm11{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm15,%zmm3,%zmm3 + vpxorq %zmm15,%zmm4,%zmm4 + vpxorq %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc 
%zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %zmm15,%zmm3,%zmm3 + vaesenclast %zmm15,%zmm4,%zmm4 + vaesenclast %zmm15,%zmm5,%zmm5 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %zmm7,%zmm3,%zmm3 + vpxorq %zmm10,%zmm4,%zmm4 + vpxorq %zmm11,%zmm5,%zmm5 + vextracti32x4 $2,%zmm5,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm6 + vpshufb %zmm29,%zmm3,%zmm7 + vpshufb %zmm29,%zmm4,%zmm10 + vpshufb %zmm29,%zmm5,%zmm11 + vextracti32x4 $2,%zmm11,%xmm13 + subq $16 * (15 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_170 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 112(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 176(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vmovdqu64 240(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm0,%zmm6,%zmm15 + vpternlogq $0x96,%zmm3,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm4,%zmm6,%zmm17 + vpternlogq $0x96,%zmm5,%zmm7,%zmm19 + vmovdqu64 304(%rsi),%ymm20 + vinserti64x2 $2,336(%rsi),%zmm20,%zmm20 + vpclmulqdq $0x01,%zmm20,%zmm11,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm11,%zmm5 + vpclmulqdq $0x11,%zmm20,%zmm11,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm11,%zmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + 
vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_170 +.L_small_initial_partial_block_170: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 128(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 192(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vmovdqu64 256(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm0,%zmm6,%zmm15 + vpternlogq $0x96,%zmm3,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm4,%zmm6,%zmm17 + vpternlogq $0x96,%zmm5,%zmm7,%zmm19 + vmovdqu64 320(%rsi),%ymm20 + vpclmulqdq $0x01,%ymm20,%ymm11,%ymm4 + vpclmulqdq $0x10,%ymm20,%ymm11,%ymm5 + vpclmulqdq $0x11,%ymm20,%ymm11,%ymm0 + vpclmulqdq $0x00,%ymm20,%ymm11,%ymm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_170: + + orq %r8,%r8 + je .L_after_reduction_170 + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_170: + jmp .L_small_initial_blocks_encrypted_155 +.L_small_initial_num_blocks_is_16_155: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + vpaddd ddq_add_8888(%rip),%zmm0,%zmm4 + vpaddd ddq_add_8888(%rip),%zmm3,%zmm5 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $192,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $3,%zmm5,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%zmm7 + vmovdqu8 128(%rcx,%r11,1),%zmm10 + vmovdqu8 192(%rcx,%r11,1),%zmm11{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm15,%zmm3,%zmm3 + vpxorq %zmm15,%zmm4,%zmm4 + vpxorq %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc 
%zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %zmm15,%zmm3,%zmm3 + vaesenclast %zmm15,%zmm4,%zmm4 + vaesenclast %zmm15,%zmm5,%zmm5 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %zmm7,%zmm3,%zmm3 + vpxorq %zmm10,%zmm4,%zmm4 + vpxorq %zmm11,%zmm5,%zmm5 + vextracti32x4 $3,%zmm5,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm6 + vpshufb %zmm29,%zmm3,%zmm7 + vpshufb %zmm29,%zmm4,%zmm10 + vpshufb %zmm29,%zmm5,%zmm11 + vextracti32x4 $3,%zmm11,%xmm13 + subq $16 * (16 - 1),%r8 +.L_small_initial_partial_block_171: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 112(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 176(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vmovdqu64 240(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm0,%zmm6,%zmm15 + vpternlogq $0x96,%zmm3,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm4,%zmm6,%zmm17 + vpternlogq $0x96,%zmm5,%zmm7,%zmm19 + vmovdqu64 304(%rsi),%ymm20 + vinserti64x2 $2,336(%rsi),%zmm20,%zmm20 + vpclmulqdq $0x01,%zmm20,%zmm11,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm11,%zmm5 + vpclmulqdq $0x11,%zmm20,%zmm11,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm11,%zmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_171: + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_171: +.L_small_initial_blocks_encrypted_155: +.L_ghash_done_10: + vmovdqu64 %xmm2,0(%rsi) + vmovdqu64 %xmm14,64(%rsi) +.L_enc_dec_done_10: + jmp .Lexit_gcm_encrypt +.align 32 +.Laes_gcm_encrypt_192_avx512: + orq %r8,%r8 + je 
.L_enc_dec_done_172 + xorq %r14,%r14 + vmovdqu64 64(%rsi),%xmm14 + + movq (%rdx),%r11 + orq %r11,%r11 + je .L_partial_block_done_173 + movl $16,%r10d + leaq byte_len_to_mask_table(%rip),%r12 + cmpq %r10,%r8 + cmovcq %r8,%r10 + kmovw (%r12,%r10,2),%k1 + vmovdqu8 (%rcx),%xmm0{%k1}{z} + + vmovdqu64 16(%rsi),%xmm3 + vmovdqu64 336(%rsi),%xmm4 + + + + leaq SHIFT_MASK(%rip),%r12 + addq %r11,%r12 + vmovdqu64 (%r12),%xmm5 + vpshufb %xmm5,%xmm3,%xmm3 + vpxorq %xmm0,%xmm3,%xmm3 + + + leaq (%r8,%r11,1),%r13 + subq $16,%r13 + jge .L_no_extra_mask_173 + subq %r13,%r12 +.L_no_extra_mask_173: + + + + vmovdqu64 16(%r12),%xmm0 + vpand %xmm0,%xmm3,%xmm3 + vpshufb SHUF_MASK(%rip),%xmm3,%xmm3 + vpshufb %xmm5,%xmm3,%xmm3 + vpxorq %xmm3,%xmm14,%xmm14 + cmpq $0,%r13 + jl .L_partial_incomplete_173 + + vpclmulqdq $0x11,%xmm4,%xmm14,%xmm7 + vpclmulqdq $0x00,%xmm4,%xmm14,%xmm10 + vpclmulqdq $0x01,%xmm4,%xmm14,%xmm11 + vpclmulqdq $0x10,%xmm4,%xmm14,%xmm14 + vpxorq %xmm11,%xmm14,%xmm14 + + vpsrldq $8,%xmm14,%xmm11 + vpslldq $8,%xmm14,%xmm14 + vpxorq %xmm11,%xmm7,%xmm7 + vpxorq %xmm10,%xmm14,%xmm14 + + + + vmovdqu64 POLY2(%rip),%xmm11 + + vpclmulqdq $0x01,%xmm14,%xmm11,%xmm10 + vpslldq $8,%xmm10,%xmm10 + vpxorq %xmm10,%xmm14,%xmm14 + + + + vpclmulqdq $0x00,%xmm14,%xmm11,%xmm10 + vpsrldq $4,%xmm10,%xmm10 + vpclmulqdq $0x10,%xmm14,%xmm11,%xmm14 + vpslldq $4,%xmm14,%xmm14 + + vpternlogq $0x96,%xmm10,%xmm7,%xmm14 + + movq $0,(%rdx) + + movq %r11,%r12 + movq $16,%r11 + subq %r12,%r11 + jmp .L_enc_dec_done_173 + +.L_partial_incomplete_173: + addq %r8,(%rdx) + movq %r8,%r11 + +.L_enc_dec_done_173: + + + leaq byte_len_to_mask_table(%rip),%r12 + kmovw (%r12,%r11,2),%k1 + vmovdqu64 %xmm14,64(%rsi) + + vpshufb SHUF_MASK(%rip),%xmm3,%xmm3 + vpshufb %xmm5,%xmm3,%xmm3 + movq %r9,%r12 + vmovdqu8 %xmm3,(%r12){%k1} +.L_partial_block_done_173: + vmovdqu64 0(%rsi),%xmm2 + subq %r11,%r8 + je .L_enc_dec_done_172 + cmpq $256,%r8 + jbe .L_message_below_equal_16_blocks_172 + + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vmovdqa64 ddq_addbe_4444(%rip),%zmm27 + vmovdqa64 ddq_addbe_1234(%rip),%zmm28 + + + + + + + vmovd %xmm2,%r15d + andl $255,%r15d + + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpshufb %zmm29,%zmm2,%zmm2 + + + + cmpb $240,%r15b + jae .L_next_16_overflow_174 + vpaddd %zmm28,%zmm2,%zmm7 + vpaddd %zmm27,%zmm7,%zmm10 + vpaddd %zmm27,%zmm10,%zmm11 + vpaddd %zmm27,%zmm11,%zmm12 + jmp .L_next_16_ok_174 +.L_next_16_overflow_174: + vpshufb %zmm29,%zmm2,%zmm2 + vmovdqa64 ddq_add_4444(%rip),%zmm12 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm7 + vpaddd %zmm12,%zmm7,%zmm10 + vpaddd %zmm12,%zmm10,%zmm11 + vpaddd %zmm12,%zmm11,%zmm12 + vpshufb %zmm29,%zmm7,%zmm7 + vpshufb %zmm29,%zmm10,%zmm10 + vpshufb %zmm29,%zmm11,%zmm11 + vpshufb %zmm29,%zmm12,%zmm12 +.L_next_16_ok_174: + vshufi64x2 $255,%zmm12,%zmm12,%zmm2 + addb $16,%r15b + + vmovdqu8 0(%rcx,%r11,1),%zmm0 + vmovdqu8 64(%rcx,%r11,1),%zmm3 + vmovdqu8 128(%rcx,%r11,1),%zmm4 + vmovdqu8 192(%rcx,%r11,1),%zmm5 + + + vbroadcastf64x2 0(%rdi),%zmm6 + vpxorq %zmm6,%zmm7,%zmm7 + vpxorq %zmm6,%zmm10,%zmm10 + vpxorq %zmm6,%zmm11,%zmm11 + vpxorq %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 16(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 32(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 48(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 
64(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 80(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 96(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 112(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 128(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 144(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 160(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 176(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 192(%rdi),%zmm6 + vaesenclast %zmm6,%zmm7,%zmm7 + vaesenclast %zmm6,%zmm10,%zmm10 + vaesenclast %zmm6,%zmm11,%zmm11 + vaesenclast %zmm6,%zmm12,%zmm12 + + + vpxorq %zmm0,%zmm7,%zmm7 + vpxorq %zmm3,%zmm10,%zmm10 + vpxorq %zmm4,%zmm11,%zmm11 + vpxorq %zmm5,%zmm12,%zmm12 + + + movq %r9,%r10 + vmovdqu8 %zmm7,0(%r10,%r11,1) + vmovdqu8 %zmm10,64(%r10,%r11,1) + vmovdqu8 %zmm11,128(%r10,%r11,1) + vmovdqu8 %zmm12,192(%r10,%r11,1) + + vpshufb %zmm29,%zmm7,%zmm7 + vpshufb %zmm29,%zmm10,%zmm10 + vpshufb %zmm29,%zmm11,%zmm11 + vpshufb %zmm29,%zmm12,%zmm12 + vmovdqa64 %zmm7,768(%rsp) + vmovdqa64 %zmm10,832(%rsp) + vmovdqa64 %zmm11,896(%rsp) + vmovdqa64 %zmm12,960(%rsp) + testq %r14,%r14 + jnz .L_skip_hkeys_precomputation_175 + + vmovdqu64 288(%rsi),%zmm0 + vmovdqu64 %zmm0,704(%rsp) + + vmovdqu64 224(%rsi),%zmm3 + vmovdqu64 %zmm3,640(%rsp) + + + vshufi64x2 $0x00,%zmm3,%zmm3,%zmm3 + + vmovdqu64 160(%rsi),%zmm4 + vmovdqu64 %zmm4,576(%rsp) + + vmovdqu64 96(%rsi),%zmm5 + vmovdqu64 %zmm5,512(%rsp) +.L_skip_hkeys_precomputation_175: + cmpq $512,%r8 + jb .L_message_below_32_blocks_172 + + + + cmpb $240,%r15b + jae .L_next_16_overflow_176 + vpaddd %zmm28,%zmm2,%zmm7 + vpaddd %zmm27,%zmm7,%zmm10 + vpaddd %zmm27,%zmm10,%zmm11 + vpaddd %zmm27,%zmm11,%zmm12 + jmp .L_next_16_ok_176 +.L_next_16_overflow_176: + vpshufb %zmm29,%zmm2,%zmm2 + vmovdqa64 ddq_add_4444(%rip),%zmm12 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm7 + vpaddd %zmm12,%zmm7,%zmm10 + vpaddd %zmm12,%zmm10,%zmm11 + vpaddd %zmm12,%zmm11,%zmm12 + vpshufb %zmm29,%zmm7,%zmm7 + vpshufb %zmm29,%zmm10,%zmm10 + vpshufb %zmm29,%zmm11,%zmm11 + vpshufb %zmm29,%zmm12,%zmm12 +.L_next_16_ok_176: + vshufi64x2 $255,%zmm12,%zmm12,%zmm2 + addb $16,%r15b + + vmovdqu8 256(%rcx,%r11,1),%zmm0 + vmovdqu8 320(%rcx,%r11,1),%zmm3 + vmovdqu8 384(%rcx,%r11,1),%zmm4 + vmovdqu8 448(%rcx,%r11,1),%zmm5 + + + vbroadcastf64x2 0(%rdi),%zmm6 + vpxorq %zmm6,%zmm7,%zmm7 + vpxorq %zmm6,%zmm10,%zmm10 + vpxorq %zmm6,%zmm11,%zmm11 + vpxorq %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 16(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 32(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 48(%rdi),%zmm6 + vaesenc 
%zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 64(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 80(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 96(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 112(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 128(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 144(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 160(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 176(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 192(%rdi),%zmm6 + vaesenclast %zmm6,%zmm7,%zmm7 + vaesenclast %zmm6,%zmm10,%zmm10 + vaesenclast %zmm6,%zmm11,%zmm11 + vaesenclast %zmm6,%zmm12,%zmm12 + + + vpxorq %zmm0,%zmm7,%zmm7 + vpxorq %zmm3,%zmm10,%zmm10 + vpxorq %zmm4,%zmm11,%zmm11 + vpxorq %zmm5,%zmm12,%zmm12 + + + movq %r9,%r10 + vmovdqu8 %zmm7,256(%r10,%r11,1) + vmovdqu8 %zmm10,320(%r10,%r11,1) + vmovdqu8 %zmm11,384(%r10,%r11,1) + vmovdqu8 %zmm12,448(%r10,%r11,1) + + vpshufb %zmm29,%zmm7,%zmm7 + vpshufb %zmm29,%zmm10,%zmm10 + vpshufb %zmm29,%zmm11,%zmm11 + vpshufb %zmm29,%zmm12,%zmm12 + vmovdqa64 %zmm7,1024(%rsp) + vmovdqa64 %zmm10,1088(%rsp) + vmovdqa64 %zmm11,1152(%rsp) + vmovdqa64 %zmm12,1216(%rsp) + testq %r14,%r14 + jnz .L_skip_hkeys_precomputation_177 + vmovdqu64 640(%rsp),%zmm3 + + + vshufi64x2 $0x00,%zmm3,%zmm3,%zmm3 + + vmovdqu64 576(%rsp),%zmm4 + vmovdqu64 512(%rsp),%zmm5 + + vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4 + vpxorq %zmm10,%zmm4,%zmm4 + + vpsrldq $8,%zmm4,%zmm10 + vpslldq $8,%zmm4,%zmm4 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4 + vpslldq $4,%zmm4,%zmm4 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm4 + + vmovdqu64 %zmm4,448(%rsp) + + vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5 + vpxorq %zmm10,%zmm5,%zmm5 + + vpsrldq $8,%zmm5,%zmm10 + vpslldq $8,%zmm5,%zmm5 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5 + vpslldq $4,%zmm5,%zmm5 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm5 + + vmovdqu64 %zmm5,384(%rsp) + + vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10 + vpclmulqdq 
$0x10,%zmm3,%zmm4,%zmm4 + vpxorq %zmm10,%zmm4,%zmm4 + + vpsrldq $8,%zmm4,%zmm10 + vpslldq $8,%zmm4,%zmm4 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4 + vpslldq $4,%zmm4,%zmm4 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm4 + + vmovdqu64 %zmm4,320(%rsp) + + vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5 + vpxorq %zmm10,%zmm5,%zmm5 + + vpsrldq $8,%zmm5,%zmm10 + vpslldq $8,%zmm5,%zmm5 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5 + vpslldq $4,%zmm5,%zmm5 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm5 + + vmovdqu64 %zmm5,256(%rsp) + + vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4 + vpxorq %zmm10,%zmm4,%zmm4 + + vpsrldq $8,%zmm4,%zmm10 + vpslldq $8,%zmm4,%zmm4 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4 + vpslldq $4,%zmm4,%zmm4 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm4 + + vmovdqu64 %zmm4,192(%rsp) + + vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5 + vpxorq %zmm10,%zmm5,%zmm5 + + vpsrldq $8,%zmm5,%zmm10 + vpslldq $8,%zmm5,%zmm5 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5 + vpslldq $4,%zmm5,%zmm5 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm5 + + vmovdqu64 %zmm5,128(%rsp) + + vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4 + vpxorq %zmm10,%zmm4,%zmm4 + + vpsrldq $8,%zmm4,%zmm10 + vpslldq $8,%zmm4,%zmm4 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4 + vpslldq $4,%zmm4,%zmm4 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm4 + + vmovdqu64 %zmm4,64(%rsp) + + vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5 + vpxorq %zmm10,%zmm5,%zmm5 + + vpsrldq $8,%zmm5,%zmm10 + vpslldq $8,%zmm5,%zmm5 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5 + vpslldq $4,%zmm5,%zmm5 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm5 + + 
vmovdqu64 %zmm5,0(%rsp) +.L_skip_hkeys_precomputation_177: + movq $1,%r14 + addq $512,%r11 + subq $512,%r8 + + cmpq $768,%r8 + jb .L_no_more_big_nblocks_172 +.L_encrypt_big_nblocks_172: + cmpb $240,%r15b + jae .L_16_blocks_overflow_178 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_178 +.L_16_blocks_overflow_178: + vpshufb %zmm29,%zmm2,%zmm2 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_178: + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp),%zmm1 + + + + + vshufi64x2 $255,%zmm5,%zmm5,%zmm2 + addb $16,%r15b + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + + + + + + + + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm6 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + + + + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%zmm21 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm15,%zmm10,%zmm26 + vpxorq %zmm12,%zmm6,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 
160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + + + + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + + + + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1) + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 + vmovdqa64 %zmm0,1280(%rsp) + vmovdqa64 %zmm3,1344(%rsp) + vmovdqa64 %zmm4,1408(%rsp) + vmovdqa64 %zmm5,1472(%rsp) + cmpb $240,%r15b + jae .L_16_blocks_overflow_179 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_179 +.L_16_blocks_overflow_179: + vpshufb %zmm29,%zmm2,%zmm2 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_179: + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 256(%rsp),%zmm1 + + + + + vshufi64x2 $255,%zmm5,%zmm5,%zmm2 + addb $16,%r15b + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 320(%rsp),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + + + + + + + + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 384(%rsp),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 448(%rsp),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm6 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + + + + vmovdqu8 256(%rcx,%r11,1),%zmm17 + vmovdqu8 320(%rcx,%r11,1),%zmm19 + vmovdqu8 384(%rcx,%r11,1),%zmm20 + vmovdqu8 448(%rcx,%r11,1),%zmm21 + + 
+ + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vpternlogq $0x96,%zmm12,%zmm6,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + + + + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + + + + movq %r9,%r10 + vmovdqu8 %zmm0,256(%r10,%r11,1) + vmovdqu8 %zmm3,320(%r10,%r11,1) + vmovdqu8 %zmm4,384(%r10,%r11,1) + vmovdqu8 %zmm5,448(%r10,%r11,1) + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 + vmovdqa64 %zmm0,768(%rsp) + vmovdqa64 %zmm3,832(%rsp) + vmovdqa64 %zmm4,896(%rsp) + vmovdqa64 %zmm5,960(%rsp) + cmpb $240,%r15b + jae .L_16_blocks_overflow_180 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_180 +.L_16_blocks_overflow_180: + vpshufb %zmm29,%zmm2,%zmm2 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_180: + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + + + + + vshufi64x2 $255,%zmm5,%zmm5,%zmm2 + addb $16,%r15b + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + + + + + + + + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + + + + vaesenc 
%zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm6 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + + + + vmovdqu8 512(%rcx,%r11,1),%zmm17 + vmovdqu8 576(%rcx,%r11,1),%zmm19 + vmovdqu8 640(%rcx,%r11,1),%zmm20 + vmovdqu8 704(%rcx,%r11,1),%zmm21 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + + + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vpternlogq $0x96,%zmm15,%zmm12,%zmm6 + vpxorq %zmm24,%zmm6,%zmm6 + vpternlogq $0x96,%zmm10,%zmm13,%zmm7 + vpxorq %zmm25,%zmm7,%zmm7 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vextracti64x4 $1,%zmm6,%ymm12 + vpxorq %ymm12,%ymm6,%ymm6 + vextracti32x4 $1,%ymm6,%xmm12 + vpxorq %xmm12,%xmm6,%xmm6 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm6 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + + + + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + + + + movq %r9,%r10 + vmovdqu8 %zmm0,512(%r10,%r11,1) + vmovdqu8 %zmm3,576(%r10,%r11,1) + vmovdqu8 %zmm4,640(%r10,%r11,1) + vmovdqu8 %zmm5,704(%r10,%r11,1) + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 + vmovdqa64 %zmm0,1024(%rsp) + vmovdqa64 %zmm3,1088(%rsp) + vmovdqa64 %zmm4,1152(%rsp) + 
vmovdqa64 %zmm5,1216(%rsp) + vmovdqa64 %zmm6,%zmm14 + + addq $768,%r11 + subq $768,%r8 + cmpq $768,%r8 + jae .L_encrypt_big_nblocks_172 + +.L_no_more_big_nblocks_172: + + cmpq $512,%r8 + jae .L_encrypt_32_blocks_172 + + cmpq $256,%r8 + jae .L_encrypt_16_blocks_172 +.L_encrypt_0_blocks_ghash_32_172: + movl %r8d,%r10d + andl $~15,%r10d + movl $256,%ebx + subl %r10d,%ebx + vmovdqa64 768(%rsp),%zmm13 + vpxorq %zmm14,%zmm13,%zmm13 + vmovdqu64 0(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 832(%rsp),%zmm13 + vmovdqu64 64(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + vpxorq %zmm10,%zmm4,%zmm26 + vpxorq %zmm6,%zmm0,%zmm24 + vpxorq %zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + vmovdqa64 896(%rsp),%zmm13 + vmovdqu64 128(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 960(%rsp),%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + + vpternlogq $0x96,%zmm10,%zmm4,%zmm26 + vpternlogq $0x96,%zmm6,%zmm0,%zmm24 + vpternlogq $0x96,%zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + addl $256,%ebx + movl %r8d,%r10d + addl $15,%r10d + shrl $4,%r10d + je .L_last_num_blocks_is_0_181 + + cmpl $8,%r10d + je .L_last_num_blocks_is_8_181 + jb .L_last_num_blocks_is_7_1_181 + + + cmpl $12,%r10d + je .L_last_num_blocks_is_12_181 + jb .L_last_num_blocks_is_11_9_181 + + + cmpl $15,%r10d + je .L_last_num_blocks_is_15_181 + ja .L_last_num_blocks_is_16_181 + cmpl $14,%r10d + je .L_last_num_blocks_is_14_181 + jmp .L_last_num_blocks_is_13_181 + +.L_last_num_blocks_is_11_9_181: + + cmpl $10,%r10d + je .L_last_num_blocks_is_10_181 + ja .L_last_num_blocks_is_11_181 + jmp .L_last_num_blocks_is_9_181 + +.L_last_num_blocks_is_7_1_181: + cmpl $4,%r10d + je .L_last_num_blocks_is_4_181 + jb .L_last_num_blocks_is_3_1_181 + + cmpl $6,%r10d + ja .L_last_num_blocks_is_7_181 + je .L_last_num_blocks_is_6_181 + jmp .L_last_num_blocks_is_5_181 + +.L_last_num_blocks_is_3_1_181: + + cmpl $2,%r10d + ja .L_last_num_blocks_is_3_181 + je .L_last_num_blocks_is_2_181 +.L_last_num_blocks_is_1_181: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $255,%r15d + jae .L_16_blocks_overflow_182 + vpaddd %xmm28,%xmm2,%xmm0 + jmp .L_16_blocks_ok_182 + +.L_16_blocks_overflow_182: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %xmm29,%xmm0,%xmm0 +.L_16_blocks_ok_182: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq 
$0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%xmm17{%k1}{z} + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %xmm31,%xmm0,%xmm0 + vaesenclast %xmm30,%xmm0,%xmm0 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti32x4 $0,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %xmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %xmm29,%xmm0,%xmm17 + vextracti32x4 $0,%zmm17,%xmm7 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_183 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_183 +.L_small_initial_partial_block_183: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + + + vpsrldq $8,%zmm26,%zmm0 + vpslldq $8,%zmm26,%zmm3 + vpxorq %zmm0,%zmm24,%zmm24 + vpxorq %zmm3,%zmm25,%zmm25 + vextracti64x4 $1,%zmm24,%ymm0 + vpxorq %ymm0,%ymm24,%ymm24 + vextracti32x4 $1,%ymm24,%xmm0 + vpxorq %xmm0,%xmm24,%xmm24 + vextracti64x4 $1,%zmm25,%ymm3 + vpxorq %ymm3,%ymm25,%ymm25 + vextracti32x4 $1,%ymm25,%xmm3 + vpxorq %xmm3,%xmm25,%xmm25 + vmovdqa64 POLY2(%rip),%xmm0 + + + vpclmulqdq $0x01,%xmm25,%xmm0,%xmm3 + vpslldq $8,%xmm3,%xmm3 + vpxorq %xmm3,%xmm25,%xmm3 + + + vpclmulqdq $0x00,%xmm3,%xmm0,%xmm4 + vpsrldq $4,%xmm4,%xmm4 + vpclmulqdq $0x10,%xmm3,%xmm0,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq 
$0x96,%xmm24,%xmm4,%xmm14 + + + + + + + + + + + + + vpxorq %xmm7,%xmm14,%xmm14 + + jmp .L_after_reduction_183 +.L_small_initial_compute_done_183: +.L_after_reduction_183: + jmp .L_last_blocks_done_181 +.L_last_num_blocks_is_2_181: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $254,%r15d + jae .L_16_blocks_overflow_184 + vpaddd %ymm28,%ymm2,%ymm0 + jmp .L_16_blocks_ok_184 + +.L_16_blocks_overflow_184: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %ymm29,%ymm0,%ymm0 +.L_16_blocks_ok_184: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%ymm17{%k1}{z} + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %ymm31,%ymm0,%ymm0 + vaesenclast %ymm30,%ymm0,%ymm0 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %ymm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %ymm29,%ymm0,%ymm17 + vextracti32x4 $1,%zmm17,%xmm7 + subq $16 * (2 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_185 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 
$1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_185 +.L_small_initial_partial_block_185: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_185: + + orq %r8,%r8 + je .L_after_reduction_185 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_185: + jmp .L_last_blocks_done_181 +.L_last_num_blocks_is_3_181: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $253,%r15d + jae .L_16_blocks_overflow_186 + vpaddd %zmm28,%zmm2,%zmm0 + jmp .L_16_blocks_ok_186 + +.L_16_blocks_overflow_186: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %zmm29,%zmm0,%zmm0 +.L_16_blocks_ok_186: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq 
$0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm0,%zmm0 + vpxorq %zmm17,%zmm0,%zmm0 + vextracti32x4 $2,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vextracti32x4 $2,%zmm17,%xmm7 + subq $16 * (3 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_187 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_187 +.L_small_initial_partial_block_187: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_187: + + orq %r8,%r8 + je .L_after_reduction_187 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_187: + jmp .L_last_blocks_done_181 +.L_last_num_blocks_is_4_181: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $252,%r15d + jae .L_16_blocks_overflow_188 + vpaddd %zmm28,%zmm2,%zmm0 + jmp .L_16_blocks_ok_188 + +.L_16_blocks_overflow_188: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + 
vpshufb %zmm29,%zmm0,%zmm0 +.L_16_blocks_ok_188: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm0,%zmm0 + vpxorq %zmm17,%zmm0,%zmm0 + vextracti32x4 $3,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vextracti32x4 $3,%zmm17,%xmm7 + subq $16 * (4 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_189 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_189 
+.L_small_initial_partial_block_189: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_189: + + orq %r8,%r8 + je .L_after_reduction_189 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_189: + jmp .L_last_blocks_done_181 +.L_last_num_blocks_is_5_181: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $251,%r15d + jae .L_16_blocks_overflow_190 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %xmm27,%xmm0,%xmm3 + jmp .L_16_blocks_ok_190 + +.L_16_blocks_overflow_190: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %xmm29,%xmm3,%xmm3 +.L_16_blocks_ok_190: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%xmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq 
$0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %xmm30,%xmm3,%xmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %xmm19,%xmm3,%xmm3 + vextracti32x4 $0,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %xmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %xmm29,%xmm3,%xmm19 + vextracti32x4 $0,%zmm19,%xmm7 + subq $16 * (5 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_191 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_191 +.L_small_initial_partial_block_191: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_191: + + orq %r8,%r8 + je .L_after_reduction_191 + vpxorq 
%xmm7,%xmm14,%xmm14 +.L_after_reduction_191: + jmp .L_last_blocks_done_181 +.L_last_num_blocks_is_6_181: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $250,%r15d + jae .L_16_blocks_overflow_192 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %ymm27,%ymm0,%ymm3 + jmp .L_16_blocks_ok_192 + +.L_16_blocks_overflow_192: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %ymm29,%ymm3,%ymm3 +.L_16_blocks_ok_192: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%ymm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %ymm30,%ymm3,%ymm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %ymm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %ymm29,%ymm3,%ymm19 + vextracti32x4 $1,%zmm19,%xmm7 + subq $16 * (6 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_193 + + + + + + 
subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_193 +.L_small_initial_partial_block_193: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_193: + + orq %r8,%r8 + je .L_after_reduction_193 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_193: + jmp .L_last_blocks_done_181 +.L_last_num_blocks_is_7_181: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $249,%r15d + jae .L_16_blocks_overflow_194 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + jmp .L_16_blocks_ok_194 + +.L_16_blocks_overflow_194: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 +.L_16_blocks_ok_194: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 
32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti32x4 $2,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vextracti32x4 $2,%zmm19,%xmm7 + subq $16 * (7 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_195 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + 
vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_195 +.L_small_initial_partial_block_195: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_195: + + orq %r8,%r8 + je .L_after_reduction_195 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_195: + jmp .L_last_blocks_done_181 +.L_last_num_blocks_is_8_181: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $248,%r15d + jae .L_16_blocks_overflow_196 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + jmp .L_16_blocks_ok_196 + +.L_16_blocks_overflow_196: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 +.L_16_blocks_ok_196: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + 
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti32x4 $3,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vextracti32x4 $3,%zmm19,%xmm7 + subq $16 * (8 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_197 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_197 +.L_small_initial_partial_block_197: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3 + + vpxorq 
%zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_197: + + orq %r8,%r8 + je .L_after_reduction_197 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_197: + jmp .L_last_blocks_done_181 +.L_last_num_blocks_is_9_181: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $247,%r15d + jae .L_16_blocks_overflow_198 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %xmm27,%xmm3,%xmm4 + jmp .L_16_blocks_ok_198 + +.L_16_blocks_overflow_198: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %xmm29,%xmm4,%xmm4 +.L_16_blocks_ok_198: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%xmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq 
$0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %xmm30,%xmm4,%xmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %xmm20,%xmm4,%xmm4 + vextracti32x4 $0,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %xmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %xmm29,%xmm4,%xmm20 + vextracti32x4 $0,%zmm20,%xmm7 + subq $16 * (9 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_199 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_199 +.L_small_initial_partial_block_199: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq 
%zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_199: + + orq %r8,%r8 + je .L_after_reduction_199 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_199: + jmp .L_last_blocks_done_181 +.L_last_num_blocks_is_10_181: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $246,%r15d + jae .L_16_blocks_overflow_200 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %ymm27,%ymm3,%ymm4 + jmp .L_16_blocks_ok_200 + +.L_16_blocks_overflow_200: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %ymm29,%ymm4,%ymm4 +.L_16_blocks_ok_200: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%ymm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %ymm30,%ymm4,%ymm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %ymm20,%ymm4,%ymm4 + vextracti32x4 $1,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %ymm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %ymm29,%ymm4,%ymm20 + vextracti32x4 $1,%zmm20,%xmm7 + subq $16 * (10 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_201 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_201 +.L_small_initial_partial_block_201: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 
+ vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_201: + + orq %r8,%r8 + je .L_after_reduction_201 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_201: + jmp .L_last_blocks_done_181 +.L_last_num_blocks_is_11_181: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $245,%r15d + jae .L_16_blocks_overflow_202 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + jmp .L_16_blocks_ok_202 + +.L_16_blocks_overflow_202: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 +.L_16_blocks_ok_202: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + 
vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vextracti32x4 $2,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vextracti32x4 $2,%zmm20,%xmm7 + subq $16 * (11 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_203 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_203 +.L_small_initial_partial_block_203: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_203: + + orq %r8,%r8 + je .L_after_reduction_203 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_203: + jmp .L_last_blocks_done_181 +.L_last_num_blocks_is_12_181: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $244,%r15d + jae .L_16_blocks_overflow_204 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + jmp .L_16_blocks_ok_204 + +.L_16_blocks_overflow_204: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 +.L_16_blocks_ok_204: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq 
$0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vextracti32x4 $3,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vextracti32x4 $3,%zmm20,%xmm7 + subq $16 * (12 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_205 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 160(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq 
$0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_205 +.L_small_initial_partial_block_205: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_205: + + orq %r8,%r8 + je .L_after_reduction_205 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_205: + jmp .L_last_blocks_done_181 +.L_last_num_blocks_is_13_181: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $243,%r15d + jae .L_16_blocks_overflow_206 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %xmm27,%xmm4,%xmm5 + jmp .L_16_blocks_ok_206 + +.L_16_blocks_overflow_206: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %xmm29,%xmm5,%xmm5 +.L_16_blocks_ok_206: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc 
%zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%xmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %xmm30,%xmm5,%xmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %xmm21,%xmm5,%xmm5 + vextracti32x4 $0,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %xmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vpshufb %xmm29,%xmm5,%xmm21 + vextracti32x4 $0,%zmm21,%xmm7 + subq $16 * (13 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_207 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 144(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq 
$0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_207 +.L_small_initial_partial_block_207: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 160(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_207: + + orq %r8,%r8 + je .L_after_reduction_207 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_207: + jmp .L_last_blocks_done_181 +.L_last_num_blocks_is_14_181: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $242,%r15d + jae .L_16_blocks_overflow_208 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %ymm27,%ymm4,%ymm5 + jmp .L_16_blocks_ok_208 + +.L_16_blocks_overflow_208: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb 
%ymm29,%ymm5,%ymm5 +.L_16_blocks_ok_208: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%ymm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %ymm30,%ymm5,%ymm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq 
%ymm21,%ymm5,%ymm5 + vextracti32x4 $1,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %ymm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vpshufb %ymm29,%ymm5,%ymm21 + vextracti32x4 $1,%zmm21,%xmm7 + subq $16 * (14 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_209 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 128(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_209 +.L_small_initial_partial_block_209: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 144(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + 
vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_209: + + orq %r8,%r8 + je .L_after_reduction_209 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_209: + jmp .L_last_blocks_done_181 +.L_last_num_blocks_is_15_181: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $241,%r15d + jae .L_16_blocks_overflow_210 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_210 + +.L_16_blocks_overflow_210: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_210: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + 
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + vextracti32x4 $2,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vpshufb %zmm29,%zmm5,%zmm21 + vextracti32x4 $2,%zmm21,%xmm7 + subq $16 * (15 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_211 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 112(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp 
.L_small_initial_compute_done_211 +.L_small_initial_partial_block_211: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 128(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_211: + + orq %r8,%r8 + je .L_after_reduction_211 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_211: + jmp .L_last_blocks_done_181 +.L_last_num_blocks_is_16_181: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $240,%r15d + jae .L_16_blocks_overflow_212 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_212 + +.L_16_blocks_overflow_212: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_212: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq 
$0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + vextracti32x4 $3,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vpshufb %zmm29,%zmm5,%zmm21 + vextracti32x4 $3,%zmm21,%xmm7 + subq $16 * (16 - 1),%r8 +.L_small_initial_partial_block_213: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 112(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq 
$0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_213: + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_213: + jmp .L_last_blocks_done_181 +.L_last_num_blocks_is_0_181: + vmovdqa64 1024(%rsp),%zmm13 + vmovdqu64 0(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 1088(%rsp),%zmm13 + vmovdqu64 64(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + vpternlogq $0x96,%zmm10,%zmm4,%zmm26 + vpternlogq $0x96,%zmm6,%zmm0,%zmm24 + vpternlogq $0x96,%zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + vmovdqa64 1152(%rsp),%zmm13 + vmovdqu64 128(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 1216(%rsp),%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + + vpternlogq $0x96,%zmm10,%zmm4,%zmm26 + vpternlogq $0x96,%zmm6,%zmm0,%zmm24 + vpternlogq $0x96,%zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + + vpsrldq $8,%zmm26,%zmm0 + vpslldq $8,%zmm26,%zmm3 + vpxorq %zmm0,%zmm24,%zmm24 + vpxorq %zmm3,%zmm25,%zmm25 + vextracti64x4 $1,%zmm24,%ymm0 + vpxorq %ymm0,%ymm24,%ymm24 + vextracti32x4 $1,%ymm24,%xmm0 + vpxorq %xmm0,%xmm24,%xmm24 + vextracti64x4 $1,%zmm25,%ymm3 + vpxorq %ymm3,%ymm25,%ymm25 + vextracti32x4 $1,%ymm25,%xmm3 + vpxorq %xmm3,%xmm25,%xmm25 + vmovdqa64 POLY2(%rip),%xmm4 + + + vpclmulqdq $0x01,%xmm25,%xmm4,%xmm0 + vpslldq $8,%xmm0,%xmm0 + vpxorq %xmm0,%xmm25,%xmm0 + + + vpclmulqdq $0x00,%xmm0,%xmm4,%xmm3 + vpsrldq $4,%xmm3,%xmm3 + vpclmulqdq $0x10,%xmm0,%xmm4,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm24,%xmm3,%xmm14 + +.L_last_blocks_done_181: + vpshufb %xmm29,%xmm2,%xmm2 + jmp .L_ghash_done_172 +.L_encrypt_32_blocks_172: + cmpb $240,%r15b + jae .L_16_blocks_overflow_214 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + 
vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_214 +.L_16_blocks_overflow_214: + vpshufb %zmm29,%zmm2,%zmm2 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_214: + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp),%zmm1 + + + + + vshufi64x2 $255,%zmm5,%zmm5,%zmm2 + addb $16,%r15b + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + + + + + + + + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm6 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + + + + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%zmm21 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm15,%zmm10,%zmm26 + vpxorq %zmm12,%zmm6,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + 
vbroadcastf64x2 192(%rdi),%zmm30 + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + + + + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + + + + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1) + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 + vmovdqa64 %zmm0,1280(%rsp) + vmovdqa64 %zmm3,1344(%rsp) + vmovdqa64 %zmm4,1408(%rsp) + vmovdqa64 %zmm5,1472(%rsp) + cmpb $240,%r15b + jae .L_16_blocks_overflow_215 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_215 +.L_16_blocks_overflow_215: + vpshufb %zmm29,%zmm2,%zmm2 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_215: + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 256(%rsp),%zmm1 + + + + + vshufi64x2 $255,%zmm5,%zmm5,%zmm2 + addb $16,%r15b + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 320(%rsp),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + + + + + + + + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 384(%rsp),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 448(%rsp),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm6 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + + + + vmovdqu8 256(%rcx,%r11,1),%zmm17 + vmovdqu8 320(%rcx,%r11,1),%zmm19 + vmovdqu8 384(%rcx,%r11,1),%zmm20 + vmovdqu8 448(%rcx,%r11,1),%zmm21 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + 
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vpternlogq $0x96,%zmm12,%zmm6,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + + + + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + + + + movq %r9,%r10 + vmovdqu8 %zmm0,256(%r10,%r11,1) + vmovdqu8 %zmm3,320(%r10,%r11,1) + vmovdqu8 %zmm4,384(%r10,%r11,1) + vmovdqu8 %zmm5,448(%r10,%r11,1) + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 + vmovdqa64 %zmm0,768(%rsp) + vmovdqa64 %zmm3,832(%rsp) + vmovdqa64 %zmm4,896(%rsp) + vmovdqa64 %zmm5,960(%rsp) + vmovdqa64 1280(%rsp),%zmm13 + vmovdqu64 512(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 1344(%rsp),%zmm13 + vmovdqu64 576(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + vpternlogq $0x96,%zmm10,%zmm4,%zmm26 + vpternlogq $0x96,%zmm6,%zmm0,%zmm24 + vpternlogq $0x96,%zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + vmovdqa64 1408(%rsp),%zmm13 + vmovdqu64 640(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 1472(%rsp),%zmm13 + vmovdqu64 704(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + + vpternlogq $0x96,%zmm10,%zmm4,%zmm26 + vpternlogq $0x96,%zmm6,%zmm0,%zmm24 + vpternlogq $0x96,%zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + + vpsrldq $8,%zmm26,%zmm0 + vpslldq $8,%zmm26,%zmm3 + vpxorq %zmm0,%zmm24,%zmm24 + vpxorq %zmm3,%zmm25,%zmm25 + vextracti64x4 $1,%zmm24,%ymm0 + vpxorq %ymm0,%ymm24,%ymm24 + vextracti32x4 $1,%ymm24,%xmm0 + vpxorq %xmm0,%xmm24,%xmm24 + vextracti64x4 $1,%zmm25,%ymm3 + vpxorq %ymm3,%ymm25,%ymm25 + vextracti32x4 $1,%ymm25,%xmm3 + vpxorq %xmm3,%xmm25,%xmm25 + vmovdqa64 POLY2(%rip),%xmm4 + + + vpclmulqdq $0x01,%xmm25,%xmm4,%xmm0 + vpslldq $8,%xmm0,%xmm0 + vpxorq %xmm0,%xmm25,%xmm0 + + + vpclmulqdq $0x00,%xmm0,%xmm4,%xmm3 + vpsrldq $4,%xmm3,%xmm3 + vpclmulqdq $0x10,%xmm0,%xmm4,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm24,%xmm3,%xmm14 + + subq $512,%r8 + 
addq $512,%r11 + movl %r8d,%r10d + andl $~15,%r10d + movl $512,%ebx + subl %r10d,%ebx + movl %r8d,%r10d + addl $15,%r10d + shrl $4,%r10d + je .L_last_num_blocks_is_0_216 + + cmpl $8,%r10d + je .L_last_num_blocks_is_8_216 + jb .L_last_num_blocks_is_7_1_216 + + + cmpl $12,%r10d + je .L_last_num_blocks_is_12_216 + jb .L_last_num_blocks_is_11_9_216 + + + cmpl $15,%r10d + je .L_last_num_blocks_is_15_216 + ja .L_last_num_blocks_is_16_216 + cmpl $14,%r10d + je .L_last_num_blocks_is_14_216 + jmp .L_last_num_blocks_is_13_216 + +.L_last_num_blocks_is_11_9_216: + + cmpl $10,%r10d + je .L_last_num_blocks_is_10_216 + ja .L_last_num_blocks_is_11_216 + jmp .L_last_num_blocks_is_9_216 + +.L_last_num_blocks_is_7_1_216: + cmpl $4,%r10d + je .L_last_num_blocks_is_4_216 + jb .L_last_num_blocks_is_3_1_216 + + cmpl $6,%r10d + ja .L_last_num_blocks_is_7_216 + je .L_last_num_blocks_is_6_216 + jmp .L_last_num_blocks_is_5_216 + +.L_last_num_blocks_is_3_1_216: + + cmpl $2,%r10d + ja .L_last_num_blocks_is_3_216 + je .L_last_num_blocks_is_2_216 +.L_last_num_blocks_is_1_216: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $255,%r15d + jae .L_16_blocks_overflow_217 + vpaddd %xmm28,%xmm2,%xmm0 + jmp .L_16_blocks_ok_217 + +.L_16_blocks_overflow_217: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %xmm29,%xmm0,%xmm0 +.L_16_blocks_ok_217: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%xmm17{%k1}{z} + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %xmm31,%xmm0,%xmm0 + vaesenclast %xmm30,%xmm0,%xmm0 + vpxorq 
%xmm17,%xmm0,%xmm0 + vextracti32x4 $0,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %xmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %xmm29,%xmm0,%xmm17 + vextracti32x4 $0,%zmm17,%xmm7 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_218 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_218 +.L_small_initial_partial_block_218: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + + + vpsrldq $8,%zmm26,%zmm0 + vpslldq $8,%zmm26,%zmm3 + vpxorq %zmm0,%zmm24,%zmm24 + vpxorq %zmm3,%zmm25,%zmm25 + vextracti64x4 $1,%zmm24,%ymm0 + vpxorq %ymm0,%ymm24,%ymm24 + vextracti32x4 $1,%ymm24,%xmm0 + vpxorq %xmm0,%xmm24,%xmm24 + vextracti64x4 $1,%zmm25,%ymm3 + vpxorq %ymm3,%ymm25,%ymm25 + vextracti32x4 $1,%ymm25,%xmm3 + vpxorq %xmm3,%xmm25,%xmm25 + vmovdqa64 POLY2(%rip),%xmm0 + + + vpclmulqdq $0x01,%xmm25,%xmm0,%xmm3 + vpslldq $8,%xmm3,%xmm3 + vpxorq %xmm3,%xmm25,%xmm3 + + + vpclmulqdq $0x00,%xmm3,%xmm0,%xmm4 + vpsrldq $4,%xmm4,%xmm4 + vpclmulqdq $0x10,%xmm3,%xmm0,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm24,%xmm4,%xmm14 + + + + + + + + + + + + + vpxorq %xmm7,%xmm14,%xmm14 + + jmp .L_after_reduction_218 +.L_small_initial_compute_done_218: +.L_after_reduction_218: + jmp .L_last_blocks_done_216 +.L_last_num_blocks_is_2_216: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $254,%r15d + jae .L_16_blocks_overflow_219 + vpaddd %ymm28,%ymm2,%ymm0 + jmp .L_16_blocks_ok_219 + +.L_16_blocks_overflow_219: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %ymm29,%ymm0,%ymm0 +.L_16_blocks_ok_219: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq 
$0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%ymm17{%k1}{z} + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %ymm31,%ymm0,%ymm0 + vaesenclast %ymm30,%ymm0,%ymm0 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %ymm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %ymm29,%ymm0,%ymm17 + vextracti32x4 $1,%zmm17,%xmm7 + subq $16 * (2 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_220 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_220 +.L_small_initial_partial_block_220: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_220: + + orq 
%r8,%r8 + je .L_after_reduction_220 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_220: + jmp .L_last_blocks_done_216 +.L_last_num_blocks_is_3_216: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $253,%r15d + jae .L_16_blocks_overflow_221 + vpaddd %zmm28,%zmm2,%zmm0 + jmp .L_16_blocks_ok_221 + +.L_16_blocks_overflow_221: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %zmm29,%zmm0,%zmm0 +.L_16_blocks_ok_221: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm0,%zmm0 + vpxorq %zmm17,%zmm0,%zmm0 + vextracti32x4 $2,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vextracti32x4 $2,%zmm17,%xmm7 + subq $16 * (3 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_222 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + 
vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_222 +.L_small_initial_partial_block_222: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_222: + + orq %r8,%r8 + je .L_after_reduction_222 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_222: + jmp .L_last_blocks_done_216 +.L_last_num_blocks_is_4_216: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $252,%r15d + jae .L_16_blocks_overflow_223 + vpaddd %zmm28,%zmm2,%zmm0 + jmp .L_16_blocks_ok_223 + +.L_16_blocks_overflow_223: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %zmm29,%zmm0,%zmm0 +.L_16_blocks_ok_223: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq 
$0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm0,%zmm0 + vpxorq %zmm17,%zmm0,%zmm0 + vextracti32x4 $3,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vextracti32x4 $3,%zmm17,%xmm7 + subq $16 * (4 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_224 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_224 +.L_small_initial_partial_block_224: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_224: + + orq %r8,%r8 + je .L_after_reduction_224 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_224: + jmp .L_last_blocks_done_216 +.L_last_num_blocks_is_5_216: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $251,%r15d + jae .L_16_blocks_overflow_225 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %xmm27,%xmm0,%xmm3 + jmp .L_16_blocks_ok_225 + +.L_16_blocks_overflow_225: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd 
%zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %xmm29,%xmm3,%xmm3 +.L_16_blocks_ok_225: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%xmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %xmm30,%xmm3,%xmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %xmm19,%xmm3,%xmm3 + vextracti32x4 $0,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %xmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %xmm29,%xmm3,%xmm19 + vextracti32x4 $0,%zmm19,%xmm7 + subq $16 * (5 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_226 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq 
$0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_226 +.L_small_initial_partial_block_226: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_226: + + orq %r8,%r8 + je .L_after_reduction_226 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_226: + jmp .L_last_blocks_done_216 +.L_last_num_blocks_is_6_216: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $250,%r15d + jae .L_16_blocks_overflow_227 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %ymm27,%ymm0,%ymm3 + jmp .L_16_blocks_ok_227 + +.L_16_blocks_overflow_227: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %ymm29,%ymm3,%ymm3 +.L_16_blocks_ok_227: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq 
$0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%ymm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %ymm30,%ymm3,%ymm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %ymm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %ymm29,%ymm3,%ymm19 + vextracti32x4 $1,%zmm19,%xmm7 + subq $16 * (6 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_228 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_228 +.L_small_initial_partial_block_228: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0 + vpclmulqdq 
$0x00,%xmm1,%xmm19,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_228: + + orq %r8,%r8 + je .L_after_reduction_228 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_228: + jmp .L_last_blocks_done_216 +.L_last_num_blocks_is_7_216: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $249,%r15d + jae .L_16_blocks_overflow_229 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + jmp .L_16_blocks_ok_229 + +.L_16_blocks_overflow_229: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 +.L_16_blocks_ok_229: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq 
%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti32x4 $2,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vextracti32x4 $2,%zmm19,%xmm7 + subq $16 * (7 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_230 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_230 +.L_small_initial_partial_block_230: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_230: + + orq %r8,%r8 + je 
.L_after_reduction_230 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_230: + jmp .L_last_blocks_done_216 +.L_last_num_blocks_is_8_216: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $248,%r15d + jae .L_16_blocks_overflow_231 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + jmp .L_16_blocks_ok_231 + +.L_16_blocks_overflow_231: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 +.L_16_blocks_ok_231: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti32x4 $3,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vextracti32x4 $3,%zmm19,%xmm7 + subq $16 * (8 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_232 + + + + + + 
subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_232 +.L_small_initial_partial_block_232: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_232: + + orq %r8,%r8 + je .L_after_reduction_232 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_232: + jmp .L_last_blocks_done_216 +.L_last_num_blocks_is_9_216: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $247,%r15d + jae .L_16_blocks_overflow_233 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %xmm27,%xmm3,%xmm4 + jmp .L_16_blocks_ok_233 + +.L_16_blocks_overflow_233: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %xmm29,%xmm4,%xmm4 +.L_16_blocks_ok_233: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 
+ + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%xmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %xmm30,%xmm4,%xmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %xmm20,%xmm4,%xmm4 + vextracti32x4 $0,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %xmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %xmm29,%xmm4,%xmm20 + vextracti32x4 $0,%zmm20,%xmm7 + subq $16 * (9 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_234 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + 
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_234 +.L_small_initial_partial_block_234: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_234: + + orq %r8,%r8 + je .L_after_reduction_234 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_234: + jmp .L_last_blocks_done_216 +.L_last_num_blocks_is_10_216: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $246,%r15d + jae .L_16_blocks_overflow_235 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %ymm27,%ymm3,%ymm4 + jmp .L_16_blocks_ok_235 + +.L_16_blocks_overflow_235: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %ymm29,%ymm4,%ymm4 +.L_16_blocks_ok_235: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 
16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%ymm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %ymm30,%ymm4,%ymm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %ymm20,%ymm4,%ymm4 + vextracti32x4 $1,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %ymm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %ymm29,%ymm4,%ymm20 + vextracti32x4 $1,%zmm20,%xmm7 + subq $16 * (10 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_236 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq 
$0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_236 +.L_small_initial_partial_block_236: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_236: + + orq %r8,%r8 + je .L_after_reduction_236 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_236: + jmp .L_last_blocks_done_216 +.L_last_num_blocks_is_11_216: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $245,%r15d + jae .L_16_blocks_overflow_237 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + jmp .L_16_blocks_ok_237 + +.L_16_blocks_overflow_237: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb 
%zmm29,%zmm4,%zmm4 +.L_16_blocks_ok_237: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vextracti32x4 $2,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vextracti32x4 $2,%zmm20,%xmm7 + subq $16 * (11 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_238 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 176(%rsi),%zmm1 + 
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_238 +.L_small_initial_partial_block_238: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_238: + + orq %r8,%r8 + je .L_after_reduction_238 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_238: + jmp .L_last_blocks_done_216 +.L_last_num_blocks_is_12_216: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $244,%r15d + jae .L_16_blocks_overflow_239 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + jmp .L_16_blocks_ok_239 
+ +.L_16_blocks_overflow_239: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 +.L_16_blocks_ok_239: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vextracti32x4 $3,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb 
%zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vextracti32x4 $3,%zmm20,%xmm7 + subq $16 * (12 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_240 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 160(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_240 +.L_small_initial_partial_block_240: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_240: + + orq %r8,%r8 + je .L_after_reduction_240 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_240: + jmp .L_last_blocks_done_216 +.L_last_num_blocks_is_13_216: + 
leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $243,%r15d + jae .L_16_blocks_overflow_241 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %xmm27,%xmm4,%xmm5 + jmp .L_16_blocks_ok_241 + +.L_16_blocks_overflow_241: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %xmm29,%xmm5,%xmm5 +.L_16_blocks_ok_241: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%xmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc 
%zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %xmm30,%xmm5,%xmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %xmm21,%xmm5,%xmm5 + vextracti32x4 $0,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %xmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vpshufb %xmm29,%xmm5,%xmm21 + vextracti32x4 $0,%zmm21,%xmm7 + subq $16 * (13 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_242 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 144(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_242 +.L_small_initial_partial_block_242: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 160(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + 
vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_242: + + orq %r8,%r8 + je .L_after_reduction_242 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_242: + jmp .L_last_blocks_done_216 +.L_last_num_blocks_is_14_216: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $242,%r15d + jae .L_16_blocks_overflow_243 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %ymm27,%ymm4,%ymm5 + jmp .L_16_blocks_ok_243 + +.L_16_blocks_overflow_243: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %ymm29,%ymm5,%ymm5 +.L_16_blocks_ok_243: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 
128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%ymm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %ymm30,%ymm5,%ymm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %ymm21,%ymm5,%ymm5 + vextracti32x4 $1,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %ymm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vpshufb %ymm29,%ymm5,%ymm21 + vextracti32x4 $1,%zmm21,%xmm7 + subq $16 * (14 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_244 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 128(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 
POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_244 +.L_small_initial_partial_block_244: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 144(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_244: + + orq %r8,%r8 + je .L_after_reduction_244 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_244: + jmp .L_last_blocks_done_216 +.L_last_num_blocks_is_15_216: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $241,%r15d + jae .L_16_blocks_overflow_245 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_245 + +.L_16_blocks_overflow_245: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_245: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 
128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + vextracti32x4 $2,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vpshufb %zmm29,%zmm5,%zmm21 + vextracti32x4 $2,%zmm21,%xmm7 + subq $16 * (15 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_246 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 112(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq 
$0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_246 +.L_small_initial_partial_block_246: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 128(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_246: + + orq %r8,%r8 + je .L_after_reduction_246 + vpxorq %xmm7,%xmm14,%xmm14 
+.L_after_reduction_246: + jmp .L_last_blocks_done_216 +.L_last_num_blocks_is_16_216: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $240,%r15d + jae .L_16_blocks_overflow_247 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_247 + +.L_16_blocks_overflow_247: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_247: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 
160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + vextracti32x4 $3,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vpshufb %zmm29,%zmm5,%zmm21 + vextracti32x4 $3,%zmm21,%xmm7 + subq $16 * (16 - 1),%r8 +.L_small_initial_partial_block_248: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 112(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_248: + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_248: + jmp .L_last_blocks_done_216 +.L_last_num_blocks_is_0_216: + vmovdqa64 768(%rsp),%zmm13 + vpxorq %zmm14,%zmm13,%zmm13 + vmovdqu64 0(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 832(%rsp),%zmm13 + vmovdqu64 64(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + vpxorq 
%zmm10,%zmm4,%zmm26 + vpxorq %zmm6,%zmm0,%zmm24 + vpxorq %zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + vmovdqa64 896(%rsp),%zmm13 + vmovdqu64 128(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 960(%rsp),%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + + vpternlogq $0x96,%zmm10,%zmm4,%zmm26 + vpternlogq $0x96,%zmm6,%zmm0,%zmm24 + vpternlogq $0x96,%zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + + vpsrldq $8,%zmm26,%zmm0 + vpslldq $8,%zmm26,%zmm3 + vpxorq %zmm0,%zmm24,%zmm24 + vpxorq %zmm3,%zmm25,%zmm25 + vextracti64x4 $1,%zmm24,%ymm0 + vpxorq %ymm0,%ymm24,%ymm24 + vextracti32x4 $1,%ymm24,%xmm0 + vpxorq %xmm0,%xmm24,%xmm24 + vextracti64x4 $1,%zmm25,%ymm3 + vpxorq %ymm3,%ymm25,%ymm25 + vextracti32x4 $1,%ymm25,%xmm3 + vpxorq %xmm3,%xmm25,%xmm25 + vmovdqa64 POLY2(%rip),%xmm4 + + + vpclmulqdq $0x01,%xmm25,%xmm4,%xmm0 + vpslldq $8,%xmm0,%xmm0 + vpxorq %xmm0,%xmm25,%xmm0 + + + vpclmulqdq $0x00,%xmm0,%xmm4,%xmm3 + vpsrldq $4,%xmm3,%xmm3 + vpclmulqdq $0x10,%xmm0,%xmm4,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm24,%xmm3,%xmm14 + +.L_last_blocks_done_216: + vpshufb %xmm29,%xmm2,%xmm2 + jmp .L_ghash_done_172 +.L_encrypt_16_blocks_172: + cmpb $240,%r15b + jae .L_16_blocks_overflow_249 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_249 +.L_16_blocks_overflow_249: + vpshufb %zmm29,%zmm2,%zmm2 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_249: + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp),%zmm1 + + + + + vshufi64x2 $255,%zmm5,%zmm5,%zmm2 + addb $16,%r15b + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + + + + + + + + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm6 + vpternlogq 
$0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + + + + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%zmm21 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm15,%zmm10,%zmm26 + vpxorq %zmm12,%zmm6,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + + + + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + + + + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1) + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 + vmovdqa64 %zmm0,1280(%rsp) + vmovdqa64 %zmm3,1344(%rsp) + vmovdqa64 %zmm4,1408(%rsp) + vmovdqa64 %zmm5,1472(%rsp) + vmovdqa64 1024(%rsp),%zmm13 + vmovdqu64 256(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 1088(%rsp),%zmm13 + vmovdqu64 320(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + vpternlogq $0x96,%zmm10,%zmm4,%zmm26 + vpternlogq $0x96,%zmm6,%zmm0,%zmm24 + vpternlogq $0x96,%zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + vmovdqa64 1152(%rsp),%zmm13 + vmovdqu64 384(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 1216(%rsp),%zmm13 + vmovdqu64 448(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + + vpternlogq $0x96,%zmm10,%zmm4,%zmm26 + vpternlogq $0x96,%zmm6,%zmm0,%zmm24 + vpternlogq $0x96,%zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + subq 
$256,%r8 + addq $256,%r11 + movl %r8d,%r10d + addl $15,%r10d + shrl $4,%r10d + je .L_last_num_blocks_is_0_250 + + cmpl $8,%r10d + je .L_last_num_blocks_is_8_250 + jb .L_last_num_blocks_is_7_1_250 + + + cmpl $12,%r10d + je .L_last_num_blocks_is_12_250 + jb .L_last_num_blocks_is_11_9_250 + + + cmpl $15,%r10d + je .L_last_num_blocks_is_15_250 + ja .L_last_num_blocks_is_16_250 + cmpl $14,%r10d + je .L_last_num_blocks_is_14_250 + jmp .L_last_num_blocks_is_13_250 + +.L_last_num_blocks_is_11_9_250: + + cmpl $10,%r10d + je .L_last_num_blocks_is_10_250 + ja .L_last_num_blocks_is_11_250 + jmp .L_last_num_blocks_is_9_250 + +.L_last_num_blocks_is_7_1_250: + cmpl $4,%r10d + je .L_last_num_blocks_is_4_250 + jb .L_last_num_blocks_is_3_1_250 + + cmpl $6,%r10d + ja .L_last_num_blocks_is_7_250 + je .L_last_num_blocks_is_6_250 + jmp .L_last_num_blocks_is_5_250 + +.L_last_num_blocks_is_3_1_250: + + cmpl $2,%r10d + ja .L_last_num_blocks_is_3_250 + je .L_last_num_blocks_is_2_250 +.L_last_num_blocks_is_1_250: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $255,%r15d + jae .L_16_blocks_overflow_251 + vpaddd %xmm28,%xmm2,%xmm0 + jmp .L_16_blocks_ok_251 + +.L_16_blocks_overflow_251: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %xmm29,%xmm0,%xmm0 +.L_16_blocks_ok_251: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $0,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%xmm17{%k1}{z} + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %xmm31,%xmm0,%xmm0 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 
$1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vbroadcastf64x2 176(%rdi),%zmm31 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %xmm31,%xmm0,%xmm0 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %xmm30,%xmm0,%xmm0 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti32x4 $0,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %xmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %xmm29,%xmm0,%xmm17 + vextracti32x4 $0,%zmm17,%xmm7 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_252 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_252 +.L_small_initial_partial_block_252: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + + + + + + + + + + + + vpxorq %xmm7,%xmm14,%xmm14 + + jmp .L_after_reduction_252 +.L_small_initial_compute_done_252: +.L_after_reduction_252: + jmp .L_last_blocks_done_250 +.L_last_num_blocks_is_2_250: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $254,%r15d + jae .L_16_blocks_overflow_253 + vpaddd %ymm28,%ymm2,%ymm0 + jmp .L_16_blocks_ok_253 + +.L_16_blocks_overflow_253: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %ymm29,%ymm0,%ymm0 +.L_16_blocks_ok_253: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $1,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 
80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%ymm17{%k1}{z} + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %ymm31,%ymm0,%ymm0 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vbroadcastf64x2 176(%rdi),%zmm31 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %ymm31,%ymm0,%ymm0 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %ymm30,%ymm0,%ymm0 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %ymm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %ymm29,%ymm0,%ymm17 + vextracti32x4 $1,%zmm17,%xmm7 + subq $16 * (2 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_254 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_254 +.L_small_initial_partial_block_254: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 
$1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_254: + + orq %r8,%r8 + je .L_after_reduction_254 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_254: + jmp .L_last_blocks_done_250 +.L_last_num_blocks_is_3_250: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $253,%r15d + jae .L_16_blocks_overflow_255 + vpaddd %zmm28,%zmm2,%zmm0 + jmp .L_16_blocks_ok_255 + +.L_16_blocks_overflow_255: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %zmm29,%zmm0,%zmm0 +.L_16_blocks_ok_255: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $2,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vbroadcastf64x2 176(%rdi),%zmm31 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vpclmulqdq 
$0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vpxorq %zmm17,%zmm0,%zmm0 + vextracti32x4 $2,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vextracti32x4 $2,%zmm17,%xmm7 + subq $16 * (3 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_256 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_256 +.L_small_initial_partial_block_256: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_256: + + orq %r8,%r8 + je .L_after_reduction_256 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_256: + jmp .L_last_blocks_done_250 +.L_last_num_blocks_is_4_250: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $252,%r15d + jae .L_16_blocks_overflow_257 + vpaddd %zmm28,%zmm2,%zmm0 + jmp .L_16_blocks_ok_257 + +.L_16_blocks_overflow_257: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %zmm29,%zmm0,%zmm0 +.L_16_blocks_ok_257: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $3,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc 
%zmm31,%zmm0,%zmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vbroadcastf64x2 176(%rdi),%zmm31 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vpxorq %zmm17,%zmm0,%zmm0 + vextracti32x4 $3,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vextracti32x4 $3,%zmm17,%xmm7 + subq $16 * (4 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_258 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_258 +.L_small_initial_partial_block_258: + + 
+ + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_258: + + orq %r8,%r8 + je .L_after_reduction_258 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_258: + jmp .L_last_blocks_done_250 +.L_last_num_blocks_is_5_250: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $251,%r15d + jae .L_16_blocks_overflow_259 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %xmm27,%xmm0,%xmm3 + jmp .L_16_blocks_ok_259 + +.L_16_blocks_overflow_259: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %xmm29,%xmm3,%xmm3 +.L_16_blocks_ok_259: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $0,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%xmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + 
vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vbroadcastf64x2 176(%rdi),%zmm31 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %xmm30,%xmm3,%xmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %xmm19,%xmm3,%xmm3 + vextracti32x4 $0,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %xmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %xmm29,%xmm3,%xmm19 + vextracti32x4 $0,%zmm19,%xmm7 + subq $16 * (5 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_260 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_260 +.L_small_initial_partial_block_260: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 
$1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_260: + + orq %r8,%r8 + je .L_after_reduction_260 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_260: + jmp .L_last_blocks_done_250 +.L_last_num_blocks_is_6_250: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $250,%r15d + jae .L_16_blocks_overflow_261 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %ymm27,%ymm0,%ymm3 + jmp .L_16_blocks_ok_261 + +.L_16_blocks_overflow_261: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %ymm29,%ymm3,%ymm3 +.L_16_blocks_ok_261: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $1,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%ymm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq 
%ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vbroadcastf64x2 176(%rdi),%zmm31 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %ymm30,%ymm3,%ymm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %ymm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %ymm29,%ymm3,%ymm19 + vextracti32x4 $1,%zmm19,%xmm7 + subq $16 * (6 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_262 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_262 +.L_small_initial_partial_block_262: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq 
$0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_262: + + orq %r8,%r8 + je .L_after_reduction_262 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_262: + jmp .L_last_blocks_done_250 +.L_last_num_blocks_is_7_250: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $249,%r15d + jae .L_16_blocks_overflow_263 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + jmp .L_16_blocks_ok_263 + +.L_16_blocks_overflow_263: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 +.L_16_blocks_ok_263: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $2,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vbroadcastf64x2 176(%rdi),%zmm31 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 
+ vpxorq %xmm13,%xmm7,%xmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti32x4 $2,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vextracti32x4 $2,%zmm19,%xmm7 + subq $16 * (7 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_264 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_264 +.L_small_initial_partial_block_264: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_264: + + orq %r8,%r8 + je .L_after_reduction_264 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_264: + jmp .L_last_blocks_done_250 
+.L_last_num_blocks_is_8_250: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $248,%r15d + jae .L_16_blocks_overflow_265 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + jmp .L_16_blocks_ok_265 + +.L_16_blocks_overflow_265: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 +.L_16_blocks_ok_265: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $3,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vbroadcastf64x2 176(%rdi),%zmm31 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + 
vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti32x4 $3,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vextracti32x4 $3,%zmm19,%xmm7 + subq $16 * (8 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_266 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_266 +.L_small_initial_partial_block_266: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_266: + + orq %r8,%r8 + je .L_after_reduction_266 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_266: + jmp .L_last_blocks_done_250 +.L_last_num_blocks_is_9_250: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $247,%r15d + jae .L_16_blocks_overflow_267 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd 
%xmm27,%xmm3,%xmm4 + jmp .L_16_blocks_ok_267 + +.L_16_blocks_overflow_267: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %xmm29,%xmm4,%xmm4 +.L_16_blocks_ok_267: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $0,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%xmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vbroadcastf64x2 176(%rdi),%zmm31 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + 
vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %xmm30,%xmm4,%xmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %xmm20,%xmm4,%xmm4 + vextracti32x4 $0,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %xmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %xmm29,%xmm4,%xmm20 + vextracti32x4 $0,%zmm20,%xmm7 + subq $16 * (9 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_268 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_268 +.L_small_initial_partial_block_268: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq 
%xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_268: + + orq %r8,%r8 + je .L_after_reduction_268 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_268: + jmp .L_last_blocks_done_250 +.L_last_num_blocks_is_10_250: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $246,%r15d + jae .L_16_blocks_overflow_269 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %ymm27,%ymm3,%ymm4 + jmp .L_16_blocks_ok_269 + +.L_16_blocks_overflow_269: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %ymm29,%ymm4,%ymm4 +.L_16_blocks_ok_269: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $1,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%ymm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 
160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vbroadcastf64x2 176(%rdi),%zmm31 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %ymm30,%ymm4,%ymm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %ymm20,%ymm4,%ymm4 + vextracti32x4 $1,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %ymm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %ymm29,%ymm4,%ymm20 + vextracti32x4 $1,%zmm20,%xmm7 + subq $16 * (10 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_270 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_270 +.L_small_initial_partial_block_270: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq 
%zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_270: + + orq %r8,%r8 + je .L_after_reduction_270 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_270: + jmp .L_last_blocks_done_250 +.L_last_num_blocks_is_11_250: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $245,%r15d + jae .L_16_blocks_overflow_271 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + jmp .L_16_blocks_ok_271 + +.L_16_blocks_overflow_271: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 +.L_16_blocks_ok_271: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $2,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z} + 
vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vbroadcastf64x2 176(%rdi),%zmm31 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vextracti32x4 $2,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vextracti32x4 $2,%zmm20,%xmm7 + subq $16 * (11 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_272 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq 
%ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_272 +.L_small_initial_partial_block_272: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_272: + + orq %r8,%r8 + je .L_after_reduction_272 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_272: + jmp .L_last_blocks_done_250 +.L_last_num_blocks_is_12_250: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $244,%r15d + jae .L_16_blocks_overflow_273 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + jmp .L_16_blocks_ok_273 + +.L_16_blocks_overflow_273: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 +.L_16_blocks_ok_273: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $3,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq 
$0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vbroadcastf64x2 176(%rdi),%zmm31 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vextracti32x4 $3,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vextracti32x4 $3,%zmm20,%xmm7 + subq $16 * (12 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_274 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 160(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + 
vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_274 +.L_small_initial_partial_block_274: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_274: + + orq %r8,%r8 + je .L_after_reduction_274 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_274: + jmp .L_last_blocks_done_250 +.L_last_num_blocks_is_13_250: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $243,%r15d + jae .L_16_blocks_overflow_275 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %xmm27,%xmm4,%xmm5 + jmp .L_16_blocks_ok_275 + +.L_16_blocks_overflow_275: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb 
%zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %xmm29,%xmm5,%xmm5 +.L_16_blocks_ok_275: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $0,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%xmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vbroadcastf64x2 176(%rdi),%zmm31 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq 
$8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %xmm30,%xmm5,%xmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %xmm21,%xmm5,%xmm5 + vextracti32x4 $0,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %xmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vpshufb %xmm29,%xmm5,%xmm21 + vextracti32x4 $0,%zmm21,%xmm7 + subq $16 * (13 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_276 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 144(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_276 +.L_small_initial_partial_block_276: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 160(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq 
$0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_276: + + orq %r8,%r8 + je .L_after_reduction_276 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_276: + jmp .L_last_blocks_done_250 +.L_last_num_blocks_is_14_250: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $242,%r15d + jae .L_16_blocks_overflow_277 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %ymm27,%ymm4,%ymm5 + jmp .L_16_blocks_ok_277 + +.L_16_blocks_overflow_277: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %ymm29,%ymm5,%ymm5 +.L_16_blocks_ok_277: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $1,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 
0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%ymm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vbroadcastf64x2 176(%rdi),%zmm31 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %ymm30,%ymm5,%ymm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %ymm21,%ymm5,%ymm5 + vextracti32x4 $1,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %ymm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vpshufb %ymm29,%ymm5,%ymm21 + vextracti32x4 $1,%zmm21,%xmm7 + subq $16 * (14 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_278 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 128(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq 
$0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_278 +.L_small_initial_partial_block_278: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 144(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_278: + + orq %r8,%r8 + je .L_after_reduction_278 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_278: + jmp .L_last_blocks_done_250 +.L_last_num_blocks_is_15_250: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $241,%r15d + jae .L_16_blocks_overflow_279 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_279 + +.L_16_blocks_overflow_279: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd 
%zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_279: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $2,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vbroadcastf64x2 176(%rdi),%zmm31 + vpclmulqdq 
$0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + vextracti32x4 $2,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vpshufb %zmm29,%zmm5,%zmm21 + vextracti32x4 $2,%zmm21,%xmm7 + subq $16 * (15 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_280 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 112(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_280 +.L_small_initial_partial_block_280: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 128(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 
256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_280: + + orq %r8,%r8 + je .L_after_reduction_280 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_280: + jmp .L_last_blocks_done_250 +.L_last_num_blocks_is_16_250: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $240,%r15d + jae .L_16_blocks_overflow_281 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_281 + +.L_16_blocks_overflow_281: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_281: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $3,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + 
vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vbroadcastf64x2 176(%rdi),%zmm31 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + vextracti32x4 $3,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vpshufb %zmm29,%zmm5,%zmm21 + vextracti32x4 $3,%zmm21,%xmm7 + subq $16 * (16 - 1),%r8 +.L_small_initial_partial_block_282: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 112(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + 
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_282: + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_282: + jmp .L_last_blocks_done_250 +.L_last_num_blocks_is_0_250: + vmovdqa64 1280(%rsp),%zmm13 + vmovdqu64 512(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 1344(%rsp),%zmm13 + vmovdqu64 576(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + vpternlogq $0x96,%zmm10,%zmm4,%zmm26 + vpternlogq $0x96,%zmm6,%zmm0,%zmm24 + vpternlogq $0x96,%zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + vmovdqa64 1408(%rsp),%zmm13 + vmovdqu64 640(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 1472(%rsp),%zmm13 + vmovdqu64 704(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + + vpternlogq $0x96,%zmm10,%zmm4,%zmm26 + vpternlogq $0x96,%zmm6,%zmm0,%zmm24 + vpternlogq $0x96,%zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + + vpsrldq $8,%zmm26,%zmm0 + vpslldq $8,%zmm26,%zmm3 + vpxorq %zmm0,%zmm24,%zmm24 + vpxorq %zmm3,%zmm25,%zmm25 + vextracti64x4 $1,%zmm24,%ymm0 + vpxorq %ymm0,%ymm24,%ymm24 + vextracti32x4 $1,%ymm24,%xmm0 + vpxorq %xmm0,%xmm24,%xmm24 + vextracti64x4 $1,%zmm25,%ymm3 + vpxorq %ymm3,%ymm25,%ymm25 + vextracti32x4 $1,%ymm25,%xmm3 + vpxorq %xmm3,%xmm25,%xmm25 + vmovdqa64 POLY2(%rip),%xmm4 + + + vpclmulqdq $0x01,%xmm25,%xmm4,%xmm0 + vpslldq $8,%xmm0,%xmm0 + vpxorq %xmm0,%xmm25,%xmm0 + + + vpclmulqdq $0x00,%xmm0,%xmm4,%xmm3 + vpsrldq $4,%xmm3,%xmm3 + vpclmulqdq $0x10,%xmm0,%xmm4,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm24,%xmm3,%xmm14 + +.L_last_blocks_done_250: + vpshufb %xmm29,%xmm2,%xmm2 + jmp .L_ghash_done_172 + +.L_message_below_32_blocks_172: + + + subq $256,%r8 + addq $256,%r11 + movl 
%r8d,%r10d + testq %r14,%r14 + jnz .L_skip_hkeys_precomputation_283 + vmovdqu64 640(%rsp),%zmm3 + + + vshufi64x2 $0x00,%zmm3,%zmm3,%zmm3 + + vmovdqu64 576(%rsp),%zmm4 + vmovdqu64 512(%rsp),%zmm5 + + vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4 + vpxorq %zmm10,%zmm4,%zmm4 + + vpsrldq $8,%zmm4,%zmm10 + vpslldq $8,%zmm4,%zmm4 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4 + vpslldq $4,%zmm4,%zmm4 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm4 + + vmovdqu64 %zmm4,448(%rsp) + + vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5 + vpxorq %zmm10,%zmm5,%zmm5 + + vpsrldq $8,%zmm5,%zmm10 + vpslldq $8,%zmm5,%zmm5 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5 + vpslldq $4,%zmm5,%zmm5 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm5 + + vmovdqu64 %zmm5,384(%rsp) + + vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4 + vpxorq %zmm10,%zmm4,%zmm4 + + vpsrldq $8,%zmm4,%zmm10 + vpslldq $8,%zmm4,%zmm4 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4 + vpslldq $4,%zmm4,%zmm4 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm4 + + vmovdqu64 %zmm4,320(%rsp) + + vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5 + vpxorq %zmm10,%zmm5,%zmm5 + + vpsrldq $8,%zmm5,%zmm10 + vpslldq $8,%zmm5,%zmm5 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5 + vpslldq $4,%zmm5,%zmm5 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm5 + + vmovdqu64 %zmm5,256(%rsp) +.L_skip_hkeys_precomputation_283: + movq $1,%r14 + andl $~15,%r10d + movl $512,%ebx + subl %r10d,%ebx + movl %r8d,%r10d + addl $15,%r10d + shrl $4,%r10d + je .L_last_num_blocks_is_0_284 + + cmpl $8,%r10d + je .L_last_num_blocks_is_8_284 + jb .L_last_num_blocks_is_7_1_284 + + + cmpl $12,%r10d + je .L_last_num_blocks_is_12_284 + jb .L_last_num_blocks_is_11_9_284 + + + cmpl $15,%r10d + je .L_last_num_blocks_is_15_284 + ja .L_last_num_blocks_is_16_284 + cmpl $14,%r10d + je .L_last_num_blocks_is_14_284 + jmp .L_last_num_blocks_is_13_284 + +.L_last_num_blocks_is_11_9_284: + + cmpl $10,%r10d + je .L_last_num_blocks_is_10_284 + ja .L_last_num_blocks_is_11_284 + jmp .L_last_num_blocks_is_9_284 + +.L_last_num_blocks_is_7_1_284: + cmpl $4,%r10d + je .L_last_num_blocks_is_4_284 + jb .L_last_num_blocks_is_3_1_284 + + cmpl $6,%r10d + ja .L_last_num_blocks_is_7_284 + je 
.L_last_num_blocks_is_6_284 + jmp .L_last_num_blocks_is_5_284 + +.L_last_num_blocks_is_3_1_284: + + cmpl $2,%r10d + ja .L_last_num_blocks_is_3_284 + je .L_last_num_blocks_is_2_284 +.L_last_num_blocks_is_1_284: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $255,%r15d + jae .L_16_blocks_overflow_285 + vpaddd %xmm28,%xmm2,%xmm0 + jmp .L_16_blocks_ok_285 + +.L_16_blocks_overflow_285: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %xmm29,%xmm0,%xmm0 +.L_16_blocks_ok_285: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%xmm17{%k1}{z} + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %xmm31,%xmm0,%xmm0 + vaesenclast %xmm30,%xmm0,%xmm0 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti32x4 $0,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %xmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %xmm29,%xmm0,%xmm17 + vextracti32x4 $0,%zmm17,%xmm7 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_286 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq 
%ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_286 +.L_small_initial_partial_block_286: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + + + vpsrldq $8,%zmm26,%zmm0 + vpslldq $8,%zmm26,%zmm3 + vpxorq %zmm0,%zmm24,%zmm24 + vpxorq %zmm3,%zmm25,%zmm25 + vextracti64x4 $1,%zmm24,%ymm0 + vpxorq %ymm0,%ymm24,%ymm24 + vextracti32x4 $1,%ymm24,%xmm0 + vpxorq %xmm0,%xmm24,%xmm24 + vextracti64x4 $1,%zmm25,%ymm3 + vpxorq %ymm3,%ymm25,%ymm25 + vextracti32x4 $1,%ymm25,%xmm3 + vpxorq %xmm3,%xmm25,%xmm25 + vmovdqa64 POLY2(%rip),%xmm0 + + + vpclmulqdq $0x01,%xmm25,%xmm0,%xmm3 + vpslldq $8,%xmm3,%xmm3 + vpxorq %xmm3,%xmm25,%xmm3 + + + vpclmulqdq $0x00,%xmm3,%xmm0,%xmm4 + vpsrldq $4,%xmm4,%xmm4 + vpclmulqdq $0x10,%xmm3,%xmm0,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm24,%xmm4,%xmm14 + + + + + + + + + + + + + vpxorq %xmm7,%xmm14,%xmm14 + + jmp .L_after_reduction_286 +.L_small_initial_compute_done_286: +.L_after_reduction_286: + jmp .L_last_blocks_done_284 +.L_last_num_blocks_is_2_284: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $254,%r15d + jae .L_16_blocks_overflow_287 + vpaddd %ymm28,%ymm2,%ymm0 + jmp .L_16_blocks_ok_287 + +.L_16_blocks_overflow_287: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %ymm29,%ymm0,%ymm0 +.L_16_blocks_ok_287: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%ymm17{%k1}{z} + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %ymm31,%ymm0,%ymm0 + 
vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %ymm31,%ymm0,%ymm0 + vaesenclast %ymm30,%ymm0,%ymm0 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %ymm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %ymm29,%ymm0,%ymm17 + vextracti32x4 $1,%zmm17,%xmm7 + subq $16 * (2 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_288 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_288 +.L_small_initial_partial_block_288: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_288: + + orq %r8,%r8 + je .L_after_reduction_288 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_288: + jmp .L_last_blocks_done_284 +.L_last_num_blocks_is_3_284: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $253,%r15d + jae .L_16_blocks_overflow_289 + vpaddd %zmm28,%zmm2,%zmm0 + jmp .L_16_blocks_ok_289 + +.L_16_blocks_overflow_289: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %zmm29,%zmm0,%zmm0 +.L_16_blocks_ok_289: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq 
$0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm0,%zmm0 + vpxorq %zmm17,%zmm0,%zmm0 + vextracti32x4 $2,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vextracti32x4 $2,%zmm17,%xmm7 + subq $16 * (3 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_290 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_290 +.L_small_initial_partial_block_290: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq 
%zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_290: + + orq %r8,%r8 + je .L_after_reduction_290 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_290: + jmp .L_last_blocks_done_284 +.L_last_num_blocks_is_4_284: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $252,%r15d + jae .L_16_blocks_overflow_291 + vpaddd %zmm28,%zmm2,%zmm0 + jmp .L_16_blocks_ok_291 + +.L_16_blocks_overflow_291: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %zmm29,%zmm0,%zmm0 +.L_16_blocks_ok_291: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm0,%zmm0 + vpxorq %zmm17,%zmm0,%zmm0 + vextracti32x4 $3,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vextracti32x4 $3,%zmm17,%xmm7 + subq $16 * (4 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_292 + + + + + + subq 
$16,%r8 + movq $0,(%rdx) + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_292 +.L_small_initial_partial_block_292: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_292: + + orq %r8,%r8 + je .L_after_reduction_292 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_292: + jmp .L_last_blocks_done_284 +.L_last_num_blocks_is_5_284: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $251,%r15d + jae .L_16_blocks_overflow_293 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %xmm27,%xmm0,%xmm3 + jmp .L_16_blocks_ok_293 + +.L_16_blocks_overflow_293: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %xmm29,%xmm3,%xmm3 +.L_16_blocks_ok_293: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + 
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%xmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %xmm30,%xmm3,%xmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %xmm19,%xmm3,%xmm3 + vextracti32x4 $0,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %xmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %xmm29,%xmm3,%xmm19 + vextracti32x4 $0,%zmm19,%xmm7 + subq $16 * (5 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_294 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_294 +.L_small_initial_partial_block_294: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq 
$0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_294: + + orq %r8,%r8 + je .L_after_reduction_294 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_294: + jmp .L_last_blocks_done_284 +.L_last_num_blocks_is_6_284: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $250,%r15d + jae .L_16_blocks_overflow_295 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %ymm27,%ymm0,%ymm3 + jmp .L_16_blocks_ok_295 + +.L_16_blocks_overflow_295: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %ymm29,%ymm3,%ymm3 +.L_16_blocks_ok_295: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%ymm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq 
%zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %ymm30,%ymm3,%ymm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %ymm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %ymm29,%ymm3,%ymm19 + vextracti32x4 $1,%zmm19,%xmm7 + subq $16 * (6 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_296 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_296 +.L_small_initial_partial_block_296: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_296: + + orq %r8,%r8 + je 
.L_after_reduction_296 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_296: + jmp .L_last_blocks_done_284 +.L_last_num_blocks_is_7_284: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $249,%r15d + jae .L_16_blocks_overflow_297 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + jmp .L_16_blocks_ok_297 + +.L_16_blocks_overflow_297: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 +.L_16_blocks_ok_297: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti32x4 $2,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vextracti32x4 $2,%zmm19,%xmm7 + subq $16 * (7 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_298 + + + + + + 
subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_298 +.L_small_initial_partial_block_298: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_298: + + orq %r8,%r8 + je .L_after_reduction_298 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_298: + jmp .L_last_blocks_done_284 +.L_last_num_blocks_is_8_284: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $248,%r15d + jae .L_16_blocks_overflow_299 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + jmp .L_16_blocks_ok_299 + +.L_16_blocks_overflow_299: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 +.L_16_blocks_ok_299: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq 
%zmm30,%zmm3,%zmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti32x4 $3,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vextracti32x4 $3,%zmm19,%xmm7 + subq $16 * (8 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_300 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + 
vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_300 +.L_small_initial_partial_block_300: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_300: + + orq %r8,%r8 + je .L_after_reduction_300 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_300: + jmp .L_last_blocks_done_284 +.L_last_num_blocks_is_9_284: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $247,%r15d + jae .L_16_blocks_overflow_301 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %xmm27,%xmm3,%xmm4 + jmp .L_16_blocks_ok_301 + +.L_16_blocks_overflow_301: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %xmm29,%xmm4,%xmm4 +.L_16_blocks_ok_301: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + 
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%xmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %xmm30,%xmm4,%xmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %xmm20,%xmm4,%xmm4 + vextracti32x4 $0,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %xmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %xmm29,%xmm4,%xmm20 + vextracti32x4 $0,%zmm20,%xmm7 + subq $16 * (9 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_302 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq 
%xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_302 +.L_small_initial_partial_block_302: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_302: + + orq %r8,%r8 + je .L_after_reduction_302 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_302: + jmp .L_last_blocks_done_284 +.L_last_num_blocks_is_10_284: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $246,%r15d + jae .L_16_blocks_overflow_303 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %ymm27,%ymm3,%ymm4 + jmp .L_16_blocks_ok_303 + +.L_16_blocks_overflow_303: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %ymm29,%ymm4,%ymm4 +.L_16_blocks_ok_303: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq 
$0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%ymm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %ymm30,%ymm4,%ymm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %ymm20,%ymm4,%ymm4 + vextracti32x4 $1,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %ymm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %ymm29,%ymm4,%ymm20 + vextracti32x4 $1,%zmm20,%xmm7 + subq $16 * (10 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_304 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq 
%xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_304 +.L_small_initial_partial_block_304: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_304: + + orq %r8,%r8 + je .L_after_reduction_304 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_304: + jmp .L_last_blocks_done_284 +.L_last_num_blocks_is_11_284: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $245,%r15d + jae .L_16_blocks_overflow_305 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + jmp .L_16_blocks_ok_305 + +.L_16_blocks_overflow_305: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 +.L_16_blocks_ok_305: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq 
$0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vextracti32x4 $2,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vextracti32x4 $2,%zmm20,%xmm7 + subq $16 * (11 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_306 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq 
%zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_306 +.L_small_initial_partial_block_306: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_306: + + orq %r8,%r8 + je .L_after_reduction_306 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_306: + jmp .L_last_blocks_done_284 +.L_last_num_blocks_is_12_284: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $244,%r15d + jae .L_16_blocks_overflow_307 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + jmp .L_16_blocks_ok_307 + +.L_16_blocks_overflow_307: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 +.L_16_blocks_ok_307: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 
896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vextracti32x4 $3,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vextracti32x4 $3,%zmm20,%xmm7 + subq $16 * (12 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_308 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 160(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + 
vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_308 +.L_small_initial_partial_block_308: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_308: + + orq %r8,%r8 + je .L_after_reduction_308 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_308: + jmp .L_last_blocks_done_284 +.L_last_num_blocks_is_13_284: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $243,%r15d + jae .L_16_blocks_overflow_309 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %xmm27,%xmm4,%xmm5 + jmp .L_16_blocks_ok_309 + +.L_16_blocks_overflow_309: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %xmm29,%xmm5,%xmm5 +.L_16_blocks_ok_309: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 
832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%xmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %xmm30,%xmm5,%xmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %xmm21,%xmm5,%xmm5 + vextracti32x4 $0,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %xmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vpshufb 
%xmm29,%xmm5,%xmm21 + vextracti32x4 $0,%zmm21,%xmm7 + subq $16 * (13 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_310 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 144(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_310 +.L_small_initial_partial_block_310: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 160(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_310: + + orq %r8,%r8 + je 
.L_after_reduction_310 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_310: + jmp .L_last_blocks_done_284 +.L_last_num_blocks_is_14_284: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $242,%r15d + jae .L_16_blocks_overflow_311 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %ymm27,%ymm4,%ymm5 + jmp .L_16_blocks_ok_311 + +.L_16_blocks_overflow_311: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %ymm29,%ymm5,%ymm5 +.L_16_blocks_ok_311: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%ymm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + 
vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %ymm30,%ymm5,%ymm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %ymm21,%ymm5,%ymm5 + vextracti32x4 $1,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %ymm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vpshufb %ymm29,%ymm5,%ymm21 + vextracti32x4 $1,%zmm21,%xmm7 + subq $16 * (14 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_312 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 128(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_312 +.L_small_initial_partial_block_312: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 144(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq 
$0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_312: + + orq %r8,%r8 + je .L_after_reduction_312 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_312: + jmp .L_last_blocks_done_284 +.L_last_num_blocks_is_15_284: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $241,%r15d + jae .L_16_blocks_overflow_313 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_313 + +.L_16_blocks_overflow_313: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_313: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq 
$0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + vextracti32x4 $2,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vpshufb %zmm29,%zmm5,%zmm21 + vextracti32x4 $2,%zmm21,%xmm7 + subq $16 * (15 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_314 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 112(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq 
$0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_314 +.L_small_initial_partial_block_314: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 128(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_314: + + orq %r8,%r8 + je .L_after_reduction_314 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_314: + jmp .L_last_blocks_done_284 +.L_last_num_blocks_is_16_284: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $240,%r15d + jae .L_16_blocks_overflow_315 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_315 + +.L_16_blocks_overflow_315: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_315: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + 
vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + vextracti32x4 $3,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb 
%zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vpshufb %zmm29,%zmm5,%zmm21 + vextracti32x4 $3,%zmm21,%xmm7 + subq $16 * (16 - 1),%r8 +.L_small_initial_partial_block_316: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 112(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_316: + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_316: + jmp .L_last_blocks_done_284 +.L_last_num_blocks_is_0_284: + vmovdqa64 768(%rsp),%zmm13 + vpxorq %zmm14,%zmm13,%zmm13 + vmovdqu64 0(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 832(%rsp),%zmm13 + vmovdqu64 64(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + vpxorq %zmm10,%zmm4,%zmm26 + vpxorq %zmm6,%zmm0,%zmm24 + vpxorq %zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + vmovdqa64 896(%rsp),%zmm13 + vmovdqu64 128(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 960(%rsp),%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + + vpternlogq $0x96,%zmm10,%zmm4,%zmm26 + vpternlogq $0x96,%zmm6,%zmm0,%zmm24 + vpternlogq $0x96,%zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + + vpsrldq $8,%zmm26,%zmm0 + vpslldq $8,%zmm26,%zmm3 + vpxorq %zmm0,%zmm24,%zmm24 + vpxorq %zmm3,%zmm25,%zmm25 + vextracti64x4 $1,%zmm24,%ymm0 + vpxorq %ymm0,%ymm24,%ymm24 + vextracti32x4 
$1,%ymm24,%xmm0 + vpxorq %xmm0,%xmm24,%xmm24 + vextracti64x4 $1,%zmm25,%ymm3 + vpxorq %ymm3,%ymm25,%ymm25 + vextracti32x4 $1,%ymm25,%xmm3 + vpxorq %xmm3,%xmm25,%xmm25 + vmovdqa64 POLY2(%rip),%xmm4 + + + vpclmulqdq $0x01,%xmm25,%xmm4,%xmm0 + vpslldq $8,%xmm0,%xmm0 + vpxorq %xmm0,%xmm25,%xmm0 + + + vpclmulqdq $0x00,%xmm0,%xmm4,%xmm3 + vpsrldq $4,%xmm3,%xmm3 + vpclmulqdq $0x10,%xmm0,%xmm4,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm24,%xmm3,%xmm14 + +.L_last_blocks_done_284: + vpshufb %xmm29,%xmm2,%xmm2 + jmp .L_ghash_done_172 + +.L_message_below_equal_16_blocks_172: + + + movl %r8d,%r12d + addl $15,%r12d + shrl $4,%r12d + cmpq $8,%r12 + je .L_small_initial_num_blocks_is_8_317 + jl .L_small_initial_num_blocks_is_7_1_317 + + + cmpq $12,%r12 + je .L_small_initial_num_blocks_is_12_317 + jl .L_small_initial_num_blocks_is_11_9_317 + + + cmpq $16,%r12 + je .L_small_initial_num_blocks_is_16_317 + cmpq $15,%r12 + je .L_small_initial_num_blocks_is_15_317 + cmpq $14,%r12 + je .L_small_initial_num_blocks_is_14_317 + jmp .L_small_initial_num_blocks_is_13_317 + +.L_small_initial_num_blocks_is_11_9_317: + + cmpq $11,%r12 + je .L_small_initial_num_blocks_is_11_317 + cmpq $10,%r12 + je .L_small_initial_num_blocks_is_10_317 + jmp .L_small_initial_num_blocks_is_9_317 + +.L_small_initial_num_blocks_is_7_1_317: + cmpq $4,%r12 + je .L_small_initial_num_blocks_is_4_317 + jl .L_small_initial_num_blocks_is_3_1_317 + + cmpq $7,%r12 + je .L_small_initial_num_blocks_is_7_317 + cmpq $6,%r12 + je .L_small_initial_num_blocks_is_6_317 + jmp .L_small_initial_num_blocks_is_5_317 + +.L_small_initial_num_blocks_is_3_1_317: + + cmpq $3,%r12 + je .L_small_initial_num_blocks_is_3_317 + cmpq $2,%r12 + je .L_small_initial_num_blocks_is_2_317 + + + + + +.L_small_initial_num_blocks_is_1_317: + vmovdqa64 SHUF_MASK(%rip),%xmm29 + vpaddd ONE(%rip),%xmm2,%xmm0 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $0,%zmm0,%xmm2 + vpshufb %xmm29,%xmm0,%xmm0 + vmovdqu8 0(%rcx,%r11,1),%xmm6{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenc %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 176(%rdi),%zmm15 + vaesenc %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 192(%rdi),%zmm15 + vaesenclast %xmm15,%xmm0,%xmm0 + vpxorq %xmm6,%xmm0,%xmm0 + vextracti32x4 $0,%zmm0,%xmm12 + movq %r9,%r10 + vmovdqu8 %xmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %xmm29,%xmm0,%xmm6 + vextracti32x4 $0,%zmm6,%xmm13 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_318 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 336(%rsi),%xmm20 + vpclmulqdq $0x01,%xmm20,%xmm6,%xmm4 + vpclmulqdq $0x10,%xmm20,%xmm6,%xmm5 + vpclmulqdq $0x11,%xmm20,%xmm6,%xmm0 + vpclmulqdq $0x00,%xmm20,%xmm6,%xmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + 
vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_318 +.L_small_initial_partial_block_318: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + + + + + + + + + + + + vpxorq %xmm13,%xmm14,%xmm14 + + jmp .L_after_reduction_318 +.L_small_initial_compute_done_318: +.L_after_reduction_318: + jmp .L_small_initial_blocks_encrypted_317 +.L_small_initial_num_blocks_is_2_317: + vmovdqa64 SHUF_MASK(%rip),%ymm29 + vshufi64x2 $0,%ymm2,%ymm2,%ymm0 + vpaddd ddq_add_1234(%rip),%ymm0,%ymm0 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $1,%zmm0,%xmm2 + vpshufb %ymm29,%ymm0,%ymm0 + vmovdqu8 0(%rcx,%r11,1),%ymm6{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenc %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 176(%rdi),%zmm15 + vaesenc %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 192(%rdi),%zmm15 + vaesenclast %ymm15,%ymm0,%ymm0 + vpxorq %ymm6,%ymm0,%ymm0 + vextracti32x4 $1,%zmm0,%xmm12 + movq %r9,%r10 + vmovdqu8 %ymm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %ymm29,%ymm0,%ymm6 + vextracti32x4 $1,%zmm6,%xmm13 + subq $16 * (2 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_319 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 320(%rsi),%ymm20 + vpclmulqdq $0x01,%ymm20,%ymm6,%ymm4 + vpclmulqdq $0x10,%ymm20,%ymm6,%ymm5 + vpclmulqdq $0x11,%ymm20,%ymm6,%ymm0 + vpclmulqdq $0x00,%ymm20,%ymm6,%ymm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_319 +.L_small_initial_partial_block_319: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 336(%rsi),%xmm20 + vpclmulqdq $0x01,%xmm20,%xmm6,%xmm4 + vpclmulqdq $0x10,%xmm20,%xmm6,%xmm5 + vpclmulqdq $0x11,%xmm20,%xmm6,%xmm0 + vpclmulqdq $0x00,%xmm20,%xmm6,%xmm3 + 
+ vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_319: + + orq %r8,%r8 + je .L_after_reduction_319 + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_319: + jmp .L_small_initial_blocks_encrypted_317 +.L_small_initial_num_blocks_is_3_317: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $2,%zmm0,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vmovdqu8 0(%rcx,%r11,1),%zmm6{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 176(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 192(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vpxorq %zmm6,%zmm0,%zmm0 + vextracti32x4 $2,%zmm0,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm6 + vextracti32x4 $2,%zmm6,%xmm13 + subq $16 * (3 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_320 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 304(%rsi),%ymm20 + vinserti64x2 $2,336(%rsi),%zmm20,%zmm20 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_320 +.L_small_initial_partial_block_320: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 320(%rsi),%ymm20 + vpclmulqdq $0x01,%ymm20,%ymm6,%ymm4 + vpclmulqdq $0x10,%ymm20,%ymm6,%ymm5 + vpclmulqdq 
$0x11,%ymm20,%ymm6,%ymm0 + vpclmulqdq $0x00,%ymm20,%ymm6,%ymm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_320: + + orq %r8,%r8 + je .L_after_reduction_320 + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_320: + jmp .L_small_initial_blocks_encrypted_317 +.L_small_initial_num_blocks_is_4_317: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $3,%zmm0,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vmovdqu8 0(%rcx,%r11,1),%zmm6{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 176(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 192(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vpxorq %zmm6,%zmm0,%zmm0 + vextracti32x4 $3,%zmm0,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm6 + vextracti32x4 $3,%zmm6,%xmm13 + subq $16 * (4 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_321 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 288(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19 + + vpxorq %zmm19,%zmm17,%zmm17 + vpsrldq $8,%zmm17,%zmm4 + vpslldq $8,%zmm17,%zmm5 + vpxorq %zmm4,%zmm15,%zmm0 + vpxorq %zmm5,%zmm16,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_321 +.L_small_initial_partial_block_321: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 304(%rsi),%ymm20 + vinserti64x2 $2,336(%rsi),%zmm20,%zmm20 + vpclmulqdq 
$0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_321: + + orq %r8,%r8 + je .L_after_reduction_321 + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_321: + jmp .L_small_initial_blocks_encrypted_317 +.L_small_initial_num_blocks_is_5_317: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $64,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $0,%zmm3,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %xmm29,%xmm3,%xmm3 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%xmm7{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 176(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 192(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %xmm15,%xmm3,%xmm3 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %xmm7,%xmm3,%xmm3 + vextracti32x4 $0,%zmm3,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %xmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm6 + vpshufb %xmm29,%xmm3,%xmm7 + vextracti32x4 $0,%zmm7,%xmm13 + subq $16 * (5 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_322 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 272(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19 + vmovdqu64 336(%rsi),%xmm20 + vpclmulqdq $0x01,%xmm20,%xmm7,%xmm4 + vpclmulqdq $0x10,%xmm20,%xmm7,%xmm5 + vpclmulqdq $0x11,%xmm20,%xmm7,%xmm0 + vpclmulqdq $0x00,%xmm20,%xmm7,%xmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + 
vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_322 +.L_small_initial_partial_block_322: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 288(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19 + + vpxorq %zmm19,%zmm17,%zmm17 + vpsrldq $8,%zmm17,%zmm4 + vpslldq $8,%zmm17,%zmm5 + vpxorq %zmm4,%zmm15,%zmm0 + vpxorq %zmm5,%zmm16,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_322: + + orq %r8,%r8 + je .L_after_reduction_322 + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_322: + jmp .L_small_initial_blocks_encrypted_317 +.L_small_initial_num_blocks_is_6_317: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $64,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $1,%zmm3,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %ymm29,%ymm3,%ymm3 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%ymm7{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 176(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 192(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast 
%ymm15,%ymm3,%ymm3 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %ymm7,%ymm3,%ymm3 + vextracti32x4 $1,%zmm3,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %ymm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm6 + vpshufb %ymm29,%ymm3,%ymm7 + vextracti32x4 $1,%zmm7,%xmm13 + subq $16 * (6 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_323 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 256(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19 + vmovdqu64 320(%rsi),%ymm20 + vpclmulqdq $0x01,%ymm20,%ymm7,%ymm4 + vpclmulqdq $0x10,%ymm20,%ymm7,%ymm5 + vpclmulqdq $0x11,%ymm20,%ymm7,%ymm0 + vpclmulqdq $0x00,%ymm20,%ymm7,%ymm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_323 +.L_small_initial_partial_block_323: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 272(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19 + vmovdqu64 336(%rsi),%xmm20 + vpclmulqdq $0x01,%xmm20,%xmm7,%xmm4 + vpclmulqdq $0x10,%xmm20,%xmm7,%xmm5 + vpclmulqdq $0x11,%xmm20,%xmm7,%xmm0 + vpclmulqdq $0x00,%xmm20,%xmm7,%xmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_323: + + orq %r8,%r8 + je .L_after_reduction_323 + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_323: + jmp .L_small_initial_blocks_encrypted_317 +.L_small_initial_num_blocks_is_7_317: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $64,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $2,%zmm3,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 
64(%rcx,%r11,1),%zmm7{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 176(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 192(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %zmm15,%zmm3,%zmm3 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %zmm7,%zmm3,%zmm3 + vextracti32x4 $2,%zmm3,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm6 + vpshufb %zmm29,%zmm3,%zmm7 + vextracti32x4 $2,%zmm7,%xmm13 + subq $16 * (7 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_324 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 240(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19 + vmovdqu64 304(%rsi),%ymm20 + vinserti64x2 $2,336(%rsi),%zmm20,%zmm20 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm5 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_324 +.L_small_initial_partial_block_324: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 256(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19 + vmovdqu64 320(%rsi),%ymm20 + vpclmulqdq $0x01,%ymm20,%ymm7,%ymm4 + vpclmulqdq $0x10,%ymm20,%ymm7,%ymm5 + vpclmulqdq $0x11,%ymm20,%ymm7,%ymm0 + vpclmulqdq $0x00,%ymm20,%ymm7,%ymm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq 
%zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_324: + + orq %r8,%r8 + je .L_after_reduction_324 + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_324: + jmp .L_small_initial_blocks_encrypted_317 +.L_small_initial_num_blocks_is_8_317: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $64,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $3,%zmm3,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%zmm7{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 176(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 192(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %zmm15,%zmm3,%zmm3 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %zmm7,%zmm3,%zmm3 + vextracti32x4 $3,%zmm3,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm6 + vpshufb %zmm29,%zmm3,%zmm7 + vextracti32x4 $3,%zmm7,%xmm13 + subq $16 * (8 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_325 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 224(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 288(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vpxorq %zmm15,%zmm0,%zmm15 + vpxorq %zmm16,%zmm3,%zmm16 + vpxorq %zmm17,%zmm4,%zmm17 + vpxorq %zmm19,%zmm5,%zmm19 + + vpxorq %zmm19,%zmm17,%zmm17 + vpsrldq $8,%zmm17,%zmm4 + vpslldq $8,%zmm17,%zmm5 + vpxorq %zmm4,%zmm15,%zmm0 + vpxorq 
%zmm5,%zmm16,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_325 +.L_small_initial_partial_block_325: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 240(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19 + vmovdqu64 304(%rsi),%ymm20 + vinserti64x2 $2,336(%rsi),%zmm20,%zmm20 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm5 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_325: + + orq %r8,%r8 + je .L_after_reduction_325 + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_325: + jmp .L_small_initial_blocks_encrypted_317 +.L_small_initial_num_blocks_is_9_317: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + vpaddd ddq_add_8888(%rip),%zmm0,%zmm4 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $128,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $0,%zmm4,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %xmm29,%xmm4,%xmm4 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%zmm7 + vmovdqu8 128(%rcx,%r11,1),%xmm10{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm15,%zmm3,%zmm3 + vpxorq %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc 
%zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 176(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 192(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %zmm15,%zmm3,%zmm3 + vaesenclast %xmm15,%xmm4,%xmm4 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %zmm7,%zmm3,%zmm3 + vpxorq %xmm10,%xmm4,%xmm4 + vextracti32x4 $0,%zmm4,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %xmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm6 + vpshufb %zmm29,%zmm3,%zmm7 + vpshufb %xmm29,%xmm4,%xmm10 + vextracti32x4 $0,%zmm10,%xmm13 + subq $16 * (9 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_326 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 208(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 272(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vpxorq %zmm15,%zmm0,%zmm15 + vpxorq %zmm16,%zmm3,%zmm16 + vpxorq %zmm17,%zmm4,%zmm17 + vpxorq %zmm19,%zmm5,%zmm19 + vmovdqu64 336(%rsi),%xmm20 + vpclmulqdq $0x01,%xmm20,%xmm10,%xmm4 + vpclmulqdq $0x10,%xmm20,%xmm10,%xmm5 + vpclmulqdq $0x11,%xmm20,%xmm10,%xmm0 + vpclmulqdq $0x00,%xmm20,%xmm10,%xmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_326 +.L_small_initial_partial_block_326: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 224(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 288(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vpxorq %zmm15,%zmm0,%zmm15 + vpxorq %zmm16,%zmm3,%zmm16 + vpxorq %zmm17,%zmm4,%zmm17 + vpxorq %zmm19,%zmm5,%zmm19 + + vpxorq %zmm19,%zmm17,%zmm17 + vpsrldq $8,%zmm17,%zmm4 + vpslldq $8,%zmm17,%zmm5 + vpxorq %zmm4,%zmm15,%zmm0 + vpxorq %zmm5,%zmm16,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + 
vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_326: + + orq %r8,%r8 + je .L_after_reduction_326 + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_326: + jmp .L_small_initial_blocks_encrypted_317 +.L_small_initial_num_blocks_is_10_317: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + vpaddd ddq_add_8888(%rip),%zmm0,%zmm4 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $128,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $1,%zmm4,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %ymm29,%ymm4,%ymm4 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%zmm7 + vmovdqu8 128(%rcx,%r11,1),%ymm10{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm15,%zmm3,%zmm3 + vpxorq %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 176(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 192(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %zmm15,%zmm3,%zmm3 + vaesenclast %ymm15,%ymm4,%ymm4 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %zmm7,%zmm3,%zmm3 + vpxorq %ymm10,%ymm4,%ymm4 + vextracti32x4 $1,%zmm4,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %ymm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm6 + vpshufb %zmm29,%zmm3,%zmm7 + vpshufb %ymm29,%ymm4,%ymm10 + vextracti32x4 $1,%zmm10,%xmm13 + subq $16 * (10 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_327 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 192(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 256(%rsi),%zmm20 + 
vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vpxorq %zmm15,%zmm0,%zmm15 + vpxorq %zmm16,%zmm3,%zmm16 + vpxorq %zmm17,%zmm4,%zmm17 + vpxorq %zmm19,%zmm5,%zmm19 + vmovdqu64 320(%rsi),%ymm20 + vpclmulqdq $0x01,%ymm20,%ymm10,%ymm4 + vpclmulqdq $0x10,%ymm20,%ymm10,%ymm5 + vpclmulqdq $0x11,%ymm20,%ymm10,%ymm0 + vpclmulqdq $0x00,%ymm20,%ymm10,%ymm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_327 +.L_small_initial_partial_block_327: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 208(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 272(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vpxorq %zmm15,%zmm0,%zmm15 + vpxorq %zmm16,%zmm3,%zmm16 + vpxorq %zmm17,%zmm4,%zmm17 + vpxorq %zmm19,%zmm5,%zmm19 + vmovdqu64 336(%rsi),%xmm20 + vpclmulqdq $0x01,%xmm20,%xmm10,%xmm4 + vpclmulqdq $0x10,%xmm20,%xmm10,%xmm5 + vpclmulqdq $0x11,%xmm20,%xmm10,%xmm0 + vpclmulqdq $0x00,%xmm20,%xmm10,%xmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_327: + + orq %r8,%r8 + je .L_after_reduction_327 + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_327: + jmp .L_small_initial_blocks_encrypted_317 +.L_small_initial_num_blocks_is_11_317: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + vpaddd ddq_add_8888(%rip),%zmm0,%zmm4 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $128,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $2,%zmm4,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 
64(%rcx,%r11,1),%zmm7 + vmovdqu8 128(%rcx,%r11,1),%zmm10{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm15,%zmm3,%zmm3 + vpxorq %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 176(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 192(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %zmm15,%zmm3,%zmm3 + vaesenclast %zmm15,%zmm4,%zmm4 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %zmm7,%zmm3,%zmm3 + vpxorq %zmm10,%zmm4,%zmm4 + vextracti32x4 $2,%zmm4,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm6 + vpshufb %zmm29,%zmm3,%zmm7 + vpshufb %zmm29,%zmm4,%zmm10 + vextracti32x4 $2,%zmm10,%xmm13 + subq $16 * (11 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_328 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 176(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 240(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vpxorq %zmm15,%zmm0,%zmm15 + vpxorq %zmm16,%zmm3,%zmm16 + vpxorq %zmm17,%zmm4,%zmm17 + vpxorq %zmm19,%zmm5,%zmm19 + vmovdqu64 304(%rsi),%ymm20 + vinserti64x2 $2,336(%rsi),%zmm20,%zmm20 + vpclmulqdq $0x01,%zmm20,%zmm10,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm10,%zmm5 + vpclmulqdq $0x11,%zmm20,%zmm10,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm10,%zmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq 
$0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_328 +.L_small_initial_partial_block_328: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 192(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 256(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vpxorq %zmm15,%zmm0,%zmm15 + vpxorq %zmm16,%zmm3,%zmm16 + vpxorq %zmm17,%zmm4,%zmm17 + vpxorq %zmm19,%zmm5,%zmm19 + vmovdqu64 320(%rsi),%ymm20 + vpclmulqdq $0x01,%ymm20,%ymm10,%ymm4 + vpclmulqdq $0x10,%ymm20,%ymm10,%ymm5 + vpclmulqdq $0x11,%ymm20,%ymm10,%ymm0 + vpclmulqdq $0x00,%ymm20,%ymm10,%ymm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_328: + + orq %r8,%r8 + je .L_after_reduction_328 + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_328: + jmp .L_small_initial_blocks_encrypted_317 +.L_small_initial_num_blocks_is_12_317: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + vpaddd ddq_add_8888(%rip),%zmm0,%zmm4 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $128,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $3,%zmm4,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%zmm7 + vmovdqu8 128(%rcx,%r11,1),%zmm10{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm15,%zmm3,%zmm3 + vpxorq %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc 
%zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 176(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 192(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %zmm15,%zmm3,%zmm3 + vaesenclast %zmm15,%zmm4,%zmm4 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %zmm7,%zmm3,%zmm3 + vpxorq %zmm10,%zmm4,%zmm4 + vextracti32x4 $3,%zmm4,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm6 + vpshufb %zmm29,%zmm3,%zmm7 + vpshufb %zmm29,%zmm4,%zmm10 + vextracti32x4 $3,%zmm10,%xmm13 + subq $16 * (12 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_329 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 160(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 224(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vmovdqu64 288(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm0,%zmm6,%zmm15 + vpternlogq $0x96,%zmm3,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm4,%zmm6,%zmm17 + vpternlogq $0x96,%zmm5,%zmm7,%zmm19 + + vpxorq %zmm19,%zmm17,%zmm17 + vpsrldq $8,%zmm17,%zmm4 + vpslldq $8,%zmm17,%zmm5 + vpxorq %zmm4,%zmm15,%zmm0 + vpxorq %zmm5,%zmm16,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_329 +.L_small_initial_partial_block_329: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 176(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 240(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vpxorq %zmm15,%zmm0,%zmm15 + vpxorq %zmm16,%zmm3,%zmm16 + vpxorq %zmm17,%zmm4,%zmm17 + vpxorq %zmm19,%zmm5,%zmm19 + vmovdqu64 304(%rsi),%ymm20 + vinserti64x2 $2,336(%rsi),%zmm20,%zmm20 + vpclmulqdq $0x01,%zmm20,%zmm10,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm10,%zmm5 + vpclmulqdq $0x11,%zmm20,%zmm10,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm10,%zmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq 
%zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_329: + + orq %r8,%r8 + je .L_after_reduction_329 + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_329: + jmp .L_small_initial_blocks_encrypted_317 +.L_small_initial_num_blocks_is_13_317: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + vpaddd ddq_add_8888(%rip),%zmm0,%zmm4 + vpaddd ddq_add_8888(%rip),%zmm3,%zmm5 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $192,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $0,%zmm5,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %xmm29,%xmm5,%xmm5 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%zmm7 + vmovdqu8 128(%rcx,%r11,1),%zmm10 + vmovdqu8 192(%rcx,%r11,1),%xmm11{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm15,%zmm3,%zmm3 + vpxorq %zmm15,%zmm4,%zmm4 + vpxorq %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 176(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 192(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %zmm15,%zmm3,%zmm3 + vaesenclast %zmm15,%zmm4,%zmm4 + vaesenclast %xmm15,%xmm5,%xmm5 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %zmm7,%zmm3,%zmm3 + vpxorq %zmm10,%zmm4,%zmm4 + vpxorq %xmm11,%xmm5,%xmm5 + vextracti32x4 $0,%zmm5,%xmm12 + movq %r9,%r10 + 
vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %xmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm6 + vpshufb %zmm29,%zmm3,%zmm7 + vpshufb %zmm29,%zmm4,%zmm10 + vpshufb %xmm29,%xmm5,%xmm11 + vextracti32x4 $0,%zmm11,%xmm13 + subq $16 * (13 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_330 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 144(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 208(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vmovdqu64 272(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm0,%zmm6,%zmm15 + vpternlogq $0x96,%zmm3,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm4,%zmm6,%zmm17 + vpternlogq $0x96,%zmm5,%zmm7,%zmm19 + vmovdqu64 336(%rsi),%xmm20 + vpclmulqdq $0x01,%xmm20,%xmm11,%xmm4 + vpclmulqdq $0x10,%xmm20,%xmm11,%xmm5 + vpclmulqdq $0x11,%xmm20,%xmm11,%xmm0 + vpclmulqdq $0x00,%xmm20,%xmm11,%xmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_330 +.L_small_initial_partial_block_330: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 160(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 224(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vmovdqu64 288(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm0,%zmm6,%zmm15 + vpternlogq $0x96,%zmm3,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm4,%zmm6,%zmm17 + vpternlogq $0x96,%zmm5,%zmm7,%zmm19 + + vpxorq %zmm19,%zmm17,%zmm17 + vpsrldq $8,%zmm17,%zmm4 + vpslldq $8,%zmm17,%zmm5 + vpxorq %zmm4,%zmm15,%zmm0 + vpxorq %zmm5,%zmm16,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq 
$0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_330: + + orq %r8,%r8 + je .L_after_reduction_330 + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_330: + jmp .L_small_initial_blocks_encrypted_317 +.L_small_initial_num_blocks_is_14_317: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + vpaddd ddq_add_8888(%rip),%zmm0,%zmm4 + vpaddd ddq_add_8888(%rip),%zmm3,%zmm5 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $192,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $1,%zmm5,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %ymm29,%ymm5,%ymm5 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%zmm7 + vmovdqu8 128(%rcx,%r11,1),%zmm10 + vmovdqu8 192(%rcx,%r11,1),%ymm11{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm15,%zmm3,%zmm3 + vpxorq %zmm15,%zmm4,%zmm4 + vpxorq %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 176(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 192(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %zmm15,%zmm3,%zmm3 + vaesenclast %zmm15,%zmm4,%zmm4 + vaesenclast %ymm15,%ymm5,%ymm5 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %zmm7,%zmm3,%zmm3 + vpxorq %zmm10,%zmm4,%zmm4 + vpxorq %ymm11,%ymm5,%ymm5 + vextracti32x4 $1,%zmm5,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %ymm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm6 + vpshufb %zmm29,%zmm3,%zmm7 + vpshufb %zmm29,%zmm4,%zmm10 + vpshufb %ymm29,%ymm5,%ymm11 + vextracti32x4 $1,%zmm11,%xmm13 + subq $16 * (14 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_331 + + + + + + subq $16,%r8 + 
movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 128(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 192(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vmovdqu64 256(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm0,%zmm6,%zmm15 + vpternlogq $0x96,%zmm3,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm4,%zmm6,%zmm17 + vpternlogq $0x96,%zmm5,%zmm7,%zmm19 + vmovdqu64 320(%rsi),%ymm20 + vpclmulqdq $0x01,%ymm20,%ymm11,%ymm4 + vpclmulqdq $0x10,%ymm20,%ymm11,%ymm5 + vpclmulqdq $0x11,%ymm20,%ymm11,%ymm0 + vpclmulqdq $0x00,%ymm20,%ymm11,%ymm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_331 +.L_small_initial_partial_block_331: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 144(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 208(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vmovdqu64 272(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm0,%zmm6,%zmm15 + vpternlogq $0x96,%zmm3,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm4,%zmm6,%zmm17 + vpternlogq $0x96,%zmm5,%zmm7,%zmm19 + vmovdqu64 336(%rsi),%xmm20 + vpclmulqdq $0x01,%xmm20,%xmm11,%xmm4 + vpclmulqdq $0x10,%xmm20,%xmm11,%xmm5 + vpclmulqdq $0x11,%xmm20,%xmm11,%xmm0 + vpclmulqdq $0x00,%xmm20,%xmm11,%xmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq 
$0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_331: + + orq %r8,%r8 + je .L_after_reduction_331 + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_331: + jmp .L_small_initial_blocks_encrypted_317 +.L_small_initial_num_blocks_is_15_317: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + vpaddd ddq_add_8888(%rip),%zmm0,%zmm4 + vpaddd ddq_add_8888(%rip),%zmm3,%zmm5 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $192,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $2,%zmm5,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%zmm7 + vmovdqu8 128(%rcx,%r11,1),%zmm10 + vmovdqu8 192(%rcx,%r11,1),%zmm11{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm15,%zmm3,%zmm3 + vpxorq %zmm15,%zmm4,%zmm4 + vpxorq %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 176(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 192(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %zmm15,%zmm3,%zmm3 + vaesenclast %zmm15,%zmm4,%zmm4 + vaesenclast %zmm15,%zmm5,%zmm5 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %zmm7,%zmm3,%zmm3 + vpxorq %zmm10,%zmm4,%zmm4 + vpxorq %zmm11,%zmm5,%zmm5 + vextracti32x4 $2,%zmm5,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm6 + vpshufb %zmm29,%zmm3,%zmm7 + vpshufb %zmm29,%zmm4,%zmm10 + vpshufb %zmm29,%zmm5,%zmm11 + vextracti32x4 $2,%zmm11,%xmm13 + subq $16 * (15 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_332 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 112(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq 
$0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 176(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vmovdqu64 240(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm0,%zmm6,%zmm15 + vpternlogq $0x96,%zmm3,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm4,%zmm6,%zmm17 + vpternlogq $0x96,%zmm5,%zmm7,%zmm19 + vmovdqu64 304(%rsi),%ymm20 + vinserti64x2 $2,336(%rsi),%zmm20,%zmm20 + vpclmulqdq $0x01,%zmm20,%zmm11,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm11,%zmm5 + vpclmulqdq $0x11,%zmm20,%zmm11,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm11,%zmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_332 +.L_small_initial_partial_block_332: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 128(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 192(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vmovdqu64 256(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm0,%zmm6,%zmm15 + vpternlogq $0x96,%zmm3,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm4,%zmm6,%zmm17 + vpternlogq $0x96,%zmm5,%zmm7,%zmm19 + vmovdqu64 320(%rsi),%ymm20 + vpclmulqdq $0x01,%ymm20,%ymm11,%ymm4 + vpclmulqdq $0x10,%ymm20,%ymm11,%ymm5 + vpclmulqdq $0x11,%ymm20,%ymm11,%ymm0 + vpclmulqdq $0x00,%ymm20,%ymm11,%ymm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_332: + + orq %r8,%r8 + je 
.L_after_reduction_332 + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_332: + jmp .L_small_initial_blocks_encrypted_317 +.L_small_initial_num_blocks_is_16_317: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + vpaddd ddq_add_8888(%rip),%zmm0,%zmm4 + vpaddd ddq_add_8888(%rip),%zmm3,%zmm5 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $192,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $3,%zmm5,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%zmm7 + vmovdqu8 128(%rcx,%r11,1),%zmm10 + vmovdqu8 192(%rcx,%r11,1),%zmm11{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm15,%zmm3,%zmm3 + vpxorq %zmm15,%zmm4,%zmm4 + vpxorq %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 176(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 192(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %zmm15,%zmm3,%zmm3 + vaesenclast %zmm15,%zmm4,%zmm4 + vaesenclast %zmm15,%zmm5,%zmm5 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %zmm7,%zmm3,%zmm3 + vpxorq %zmm10,%zmm4,%zmm4 + vpxorq %zmm11,%zmm5,%zmm5 + vextracti32x4 $3,%zmm5,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm6 + vpshufb %zmm29,%zmm3,%zmm7 + vpshufb %zmm29,%zmm4,%zmm10 + vpshufb %zmm29,%zmm5,%zmm11 + vextracti32x4 $3,%zmm11,%xmm13 + subq $16 * (16 - 1),%r8 +.L_small_initial_partial_block_333: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 112(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq 
$0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 176(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vmovdqu64 240(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm0,%zmm6,%zmm15 + vpternlogq $0x96,%zmm3,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm4,%zmm6,%zmm17 + vpternlogq $0x96,%zmm5,%zmm7,%zmm19 + vmovdqu64 304(%rsi),%ymm20 + vinserti64x2 $2,336(%rsi),%zmm20,%zmm20 + vpclmulqdq $0x01,%zmm20,%zmm11,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm11,%zmm5 + vpclmulqdq $0x11,%zmm20,%zmm11,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm11,%zmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_333: + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_333: +.L_small_initial_blocks_encrypted_317: +.L_ghash_done_172: + vmovdqu64 %xmm2,0(%rsi) + vmovdqu64 %xmm14,64(%rsi) +.L_enc_dec_done_172: + jmp .Lexit_gcm_encrypt +.align 32 +.Laes_gcm_encrypt_256_avx512: + orq %r8,%r8 + je .L_enc_dec_done_334 + xorq %r14,%r14 + vmovdqu64 64(%rsi),%xmm14 + + movq (%rdx),%r11 + orq %r11,%r11 + je .L_partial_block_done_335 + movl $16,%r10d + leaq byte_len_to_mask_table(%rip),%r12 + cmpq %r10,%r8 + cmovcq %r8,%r10 + kmovw (%r12,%r10,2),%k1 + vmovdqu8 (%rcx),%xmm0{%k1}{z} + + vmovdqu64 16(%rsi),%xmm3 + vmovdqu64 336(%rsi),%xmm4 + + + + leaq SHIFT_MASK(%rip),%r12 + addq %r11,%r12 + vmovdqu64 (%r12),%xmm5 + vpshufb %xmm5,%xmm3,%xmm3 + vpxorq %xmm0,%xmm3,%xmm3 + + + leaq (%r8,%r11,1),%r13 + subq $16,%r13 + jge .L_no_extra_mask_335 + subq %r13,%r12 +.L_no_extra_mask_335: + + + + vmovdqu64 16(%r12),%xmm0 + vpand %xmm0,%xmm3,%xmm3 + vpshufb SHUF_MASK(%rip),%xmm3,%xmm3 + vpshufb %xmm5,%xmm3,%xmm3 + vpxorq %xmm3,%xmm14,%xmm14 + cmpq $0,%r13 + jl .L_partial_incomplete_335 + + vpclmulqdq $0x11,%xmm4,%xmm14,%xmm7 + vpclmulqdq $0x00,%xmm4,%xmm14,%xmm10 + vpclmulqdq $0x01,%xmm4,%xmm14,%xmm11 + vpclmulqdq $0x10,%xmm4,%xmm14,%xmm14 + vpxorq %xmm11,%xmm14,%xmm14 + + vpsrldq $8,%xmm14,%xmm11 + vpslldq $8,%xmm14,%xmm14 + vpxorq %xmm11,%xmm7,%xmm7 + vpxorq %xmm10,%xmm14,%xmm14 + + + + vmovdqu64 POLY2(%rip),%xmm11 + + vpclmulqdq $0x01,%xmm14,%xmm11,%xmm10 + vpslldq $8,%xmm10,%xmm10 + vpxorq %xmm10,%xmm14,%xmm14 + + + + vpclmulqdq $0x00,%xmm14,%xmm11,%xmm10 + vpsrldq $4,%xmm10,%xmm10 + vpclmulqdq $0x10,%xmm14,%xmm11,%xmm14 + vpslldq $4,%xmm14,%xmm14 + + vpternlogq $0x96,%xmm10,%xmm7,%xmm14 + + movq $0,(%rdx) + + movq %r11,%r12 + movq $16,%r11 + subq %r12,%r11 + jmp .L_enc_dec_done_335 + +.L_partial_incomplete_335: + addq %r8,(%rdx) + movq %r8,%r11 + +.L_enc_dec_done_335: + + + leaq byte_len_to_mask_table(%rip),%r12 + kmovw (%r12,%r11,2),%k1 + vmovdqu64 
%xmm14,64(%rsi) + + vpshufb SHUF_MASK(%rip),%xmm3,%xmm3 + vpshufb %xmm5,%xmm3,%xmm3 + movq %r9,%r12 + vmovdqu8 %xmm3,(%r12){%k1} +.L_partial_block_done_335: + vmovdqu64 0(%rsi),%xmm2 + subq %r11,%r8 + je .L_enc_dec_done_334 + cmpq $256,%r8 + jbe .L_message_below_equal_16_blocks_334 + + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vmovdqa64 ddq_addbe_4444(%rip),%zmm27 + vmovdqa64 ddq_addbe_1234(%rip),%zmm28 + + + + + + + vmovd %xmm2,%r15d + andl $255,%r15d + + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpshufb %zmm29,%zmm2,%zmm2 + + + + cmpb $240,%r15b + jae .L_next_16_overflow_336 + vpaddd %zmm28,%zmm2,%zmm7 + vpaddd %zmm27,%zmm7,%zmm10 + vpaddd %zmm27,%zmm10,%zmm11 + vpaddd %zmm27,%zmm11,%zmm12 + jmp .L_next_16_ok_336 +.L_next_16_overflow_336: + vpshufb %zmm29,%zmm2,%zmm2 + vmovdqa64 ddq_add_4444(%rip),%zmm12 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm7 + vpaddd %zmm12,%zmm7,%zmm10 + vpaddd %zmm12,%zmm10,%zmm11 + vpaddd %zmm12,%zmm11,%zmm12 + vpshufb %zmm29,%zmm7,%zmm7 + vpshufb %zmm29,%zmm10,%zmm10 + vpshufb %zmm29,%zmm11,%zmm11 + vpshufb %zmm29,%zmm12,%zmm12 +.L_next_16_ok_336: + vshufi64x2 $255,%zmm12,%zmm12,%zmm2 + addb $16,%r15b + + vmovdqu8 0(%rcx,%r11,1),%zmm0 + vmovdqu8 64(%rcx,%r11,1),%zmm3 + vmovdqu8 128(%rcx,%r11,1),%zmm4 + vmovdqu8 192(%rcx,%r11,1),%zmm5 + + + vbroadcastf64x2 0(%rdi),%zmm6 + vpxorq %zmm6,%zmm7,%zmm7 + vpxorq %zmm6,%zmm10,%zmm10 + vpxorq %zmm6,%zmm11,%zmm11 + vpxorq %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 16(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 32(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 48(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 64(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 80(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 96(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 112(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 128(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 144(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 160(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 176(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 192(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 208(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 224(%rdi),%zmm6 + vaesenclast %zmm6,%zmm7,%zmm7 + vaesenclast %zmm6,%zmm10,%zmm10 + vaesenclast %zmm6,%zmm11,%zmm11 + vaesenclast %zmm6,%zmm12,%zmm12 + 
+ + vpxorq %zmm0,%zmm7,%zmm7 + vpxorq %zmm3,%zmm10,%zmm10 + vpxorq %zmm4,%zmm11,%zmm11 + vpxorq %zmm5,%zmm12,%zmm12 + + + movq %r9,%r10 + vmovdqu8 %zmm7,0(%r10,%r11,1) + vmovdqu8 %zmm10,64(%r10,%r11,1) + vmovdqu8 %zmm11,128(%r10,%r11,1) + vmovdqu8 %zmm12,192(%r10,%r11,1) + + vpshufb %zmm29,%zmm7,%zmm7 + vpshufb %zmm29,%zmm10,%zmm10 + vpshufb %zmm29,%zmm11,%zmm11 + vpshufb %zmm29,%zmm12,%zmm12 + vmovdqa64 %zmm7,768(%rsp) + vmovdqa64 %zmm10,832(%rsp) + vmovdqa64 %zmm11,896(%rsp) + vmovdqa64 %zmm12,960(%rsp) + testq %r14,%r14 + jnz .L_skip_hkeys_precomputation_337 + + vmovdqu64 288(%rsi),%zmm0 + vmovdqu64 %zmm0,704(%rsp) + + vmovdqu64 224(%rsi),%zmm3 + vmovdqu64 %zmm3,640(%rsp) + + + vshufi64x2 $0x00,%zmm3,%zmm3,%zmm3 + + vmovdqu64 160(%rsi),%zmm4 + vmovdqu64 %zmm4,576(%rsp) + + vmovdqu64 96(%rsi),%zmm5 + vmovdqu64 %zmm5,512(%rsp) +.L_skip_hkeys_precomputation_337: + cmpq $512,%r8 + jb .L_message_below_32_blocks_334 + + + + cmpb $240,%r15b + jae .L_next_16_overflow_338 + vpaddd %zmm28,%zmm2,%zmm7 + vpaddd %zmm27,%zmm7,%zmm10 + vpaddd %zmm27,%zmm10,%zmm11 + vpaddd %zmm27,%zmm11,%zmm12 + jmp .L_next_16_ok_338 +.L_next_16_overflow_338: + vpshufb %zmm29,%zmm2,%zmm2 + vmovdqa64 ddq_add_4444(%rip),%zmm12 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm7 + vpaddd %zmm12,%zmm7,%zmm10 + vpaddd %zmm12,%zmm10,%zmm11 + vpaddd %zmm12,%zmm11,%zmm12 + vpshufb %zmm29,%zmm7,%zmm7 + vpshufb %zmm29,%zmm10,%zmm10 + vpshufb %zmm29,%zmm11,%zmm11 + vpshufb %zmm29,%zmm12,%zmm12 +.L_next_16_ok_338: + vshufi64x2 $255,%zmm12,%zmm12,%zmm2 + addb $16,%r15b + + vmovdqu8 256(%rcx,%r11,1),%zmm0 + vmovdqu8 320(%rcx,%r11,1),%zmm3 + vmovdqu8 384(%rcx,%r11,1),%zmm4 + vmovdqu8 448(%rcx,%r11,1),%zmm5 + + + vbroadcastf64x2 0(%rdi),%zmm6 + vpxorq %zmm6,%zmm7,%zmm7 + vpxorq %zmm6,%zmm10,%zmm10 + vpxorq %zmm6,%zmm11,%zmm11 + vpxorq %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 16(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 32(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 48(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 64(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 80(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 96(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 112(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 128(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 144(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 160(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 176(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 192(%rdi),%zmm6 + vaesenc 
%zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 208(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 224(%rdi),%zmm6 + vaesenclast %zmm6,%zmm7,%zmm7 + vaesenclast %zmm6,%zmm10,%zmm10 + vaesenclast %zmm6,%zmm11,%zmm11 + vaesenclast %zmm6,%zmm12,%zmm12 + + + vpxorq %zmm0,%zmm7,%zmm7 + vpxorq %zmm3,%zmm10,%zmm10 + vpxorq %zmm4,%zmm11,%zmm11 + vpxorq %zmm5,%zmm12,%zmm12 + + + movq %r9,%r10 + vmovdqu8 %zmm7,256(%r10,%r11,1) + vmovdqu8 %zmm10,320(%r10,%r11,1) + vmovdqu8 %zmm11,384(%r10,%r11,1) + vmovdqu8 %zmm12,448(%r10,%r11,1) + + vpshufb %zmm29,%zmm7,%zmm7 + vpshufb %zmm29,%zmm10,%zmm10 + vpshufb %zmm29,%zmm11,%zmm11 + vpshufb %zmm29,%zmm12,%zmm12 + vmovdqa64 %zmm7,1024(%rsp) + vmovdqa64 %zmm10,1088(%rsp) + vmovdqa64 %zmm11,1152(%rsp) + vmovdqa64 %zmm12,1216(%rsp) + testq %r14,%r14 + jnz .L_skip_hkeys_precomputation_339 + vmovdqu64 640(%rsp),%zmm3 + + + vshufi64x2 $0x00,%zmm3,%zmm3,%zmm3 + + vmovdqu64 576(%rsp),%zmm4 + vmovdqu64 512(%rsp),%zmm5 + + vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4 + vpxorq %zmm10,%zmm4,%zmm4 + + vpsrldq $8,%zmm4,%zmm10 + vpslldq $8,%zmm4,%zmm4 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4 + vpslldq $4,%zmm4,%zmm4 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm4 + + vmovdqu64 %zmm4,448(%rsp) + + vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5 + vpxorq %zmm10,%zmm5,%zmm5 + + vpsrldq $8,%zmm5,%zmm10 + vpslldq $8,%zmm5,%zmm5 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5 + vpslldq $4,%zmm5,%zmm5 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm5 + + vmovdqu64 %zmm5,384(%rsp) + + vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4 + vpxorq %zmm10,%zmm4,%zmm4 + + vpsrldq $8,%zmm4,%zmm10 + vpslldq $8,%zmm4,%zmm4 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4 + vpslldq $4,%zmm4,%zmm4 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm4 + + vmovdqu64 %zmm4,320(%rsp) + + vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5 + vpxorq %zmm10,%zmm5,%zmm5 + + vpsrldq $8,%zmm5,%zmm10 + vpslldq $8,%zmm5,%zmm5 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5 + vpslldq $4,%zmm5,%zmm5 + + vpternlogq 
$0x96,%zmm7,%zmm6,%zmm5 + + vmovdqu64 %zmm5,256(%rsp) + + vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4 + vpxorq %zmm10,%zmm4,%zmm4 + + vpsrldq $8,%zmm4,%zmm10 + vpslldq $8,%zmm4,%zmm4 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4 + vpslldq $4,%zmm4,%zmm4 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm4 + + vmovdqu64 %zmm4,192(%rsp) + + vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5 + vpxorq %zmm10,%zmm5,%zmm5 + + vpsrldq $8,%zmm5,%zmm10 + vpslldq $8,%zmm5,%zmm5 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5 + vpslldq $4,%zmm5,%zmm5 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm5 + + vmovdqu64 %zmm5,128(%rsp) + + vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4 + vpxorq %zmm10,%zmm4,%zmm4 + + vpsrldq $8,%zmm4,%zmm10 + vpslldq $8,%zmm4,%zmm4 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4 + vpslldq $4,%zmm4,%zmm4 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm4 + + vmovdqu64 %zmm4,64(%rsp) + + vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5 + vpxorq %zmm10,%zmm5,%zmm5 + + vpsrldq $8,%zmm5,%zmm10 + vpslldq $8,%zmm5,%zmm5 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5 + vpslldq $4,%zmm5,%zmm5 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm5 + + vmovdqu64 %zmm5,0(%rsp) +.L_skip_hkeys_precomputation_339: + movq $1,%r14 + addq $512,%r11 + subq $512,%r8 + + cmpq $768,%r8 + jb .L_no_more_big_nblocks_334 +.L_encrypt_big_nblocks_334: + cmpb $240,%r15b + jae .L_16_blocks_overflow_340 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_340 +.L_16_blocks_overflow_340: + vpshufb %zmm29,%zmm2,%zmm2 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_340: + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp),%zmm1 + + + + + vshufi64x2 $255,%zmm5,%zmm5,%zmm2 + addb $16,%r15b + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + + + + + + + + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq 
%zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm6 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + + + + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%zmm21 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm15,%zmm10,%zmm26 + vpxorq %zmm12,%zmm6,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 208(%rdi),%zmm31 + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 224(%rdi),%zmm30 + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + + + + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + + + + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 
%zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1) + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 + vmovdqa64 %zmm0,1280(%rsp) + vmovdqa64 %zmm3,1344(%rsp) + vmovdqa64 %zmm4,1408(%rsp) + vmovdqa64 %zmm5,1472(%rsp) + cmpb $240,%r15b + jae .L_16_blocks_overflow_341 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_341 +.L_16_blocks_overflow_341: + vpshufb %zmm29,%zmm2,%zmm2 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_341: + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 256(%rsp),%zmm1 + + + + + vshufi64x2 $255,%zmm5,%zmm5,%zmm2 + addb $16,%r15b + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 320(%rsp),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + + + + + + + + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 384(%rsp),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 448(%rsp),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm6 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + + + + vmovdqu8 256(%rcx,%r11,1),%zmm17 + vmovdqu8 320(%rcx,%r11,1),%zmm19 + vmovdqu8 384(%rcx,%r11,1),%zmm20 + vmovdqu8 448(%rcx,%r11,1),%zmm21 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vpternlogq $0x96,%zmm12,%zmm6,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + 
vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 208(%rdi),%zmm31 + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 224(%rdi),%zmm30 + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + + + + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + + + + movq %r9,%r10 + vmovdqu8 %zmm0,256(%r10,%r11,1) + vmovdqu8 %zmm3,320(%r10,%r11,1) + vmovdqu8 %zmm4,384(%r10,%r11,1) + vmovdqu8 %zmm5,448(%r10,%r11,1) + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 + vmovdqa64 %zmm0,768(%rsp) + vmovdqa64 %zmm3,832(%rsp) + vmovdqa64 %zmm4,896(%rsp) + vmovdqa64 %zmm5,960(%rsp) + cmpb $240,%r15b + jae .L_16_blocks_overflow_342 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_342 +.L_16_blocks_overflow_342: + vpshufb %zmm29,%zmm2,%zmm2 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_342: + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + + + + + vshufi64x2 $255,%zmm5,%zmm5,%zmm2 + addb $16,%r15b + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + + + + + + + + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 
80(%rdi),%zmm31 + + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm6 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + + + + vmovdqu8 512(%rcx,%r11,1),%zmm17 + vmovdqu8 576(%rcx,%r11,1),%zmm19 + vmovdqu8 640(%rcx,%r11,1),%zmm20 + vmovdqu8 704(%rcx,%r11,1),%zmm21 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + + + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vpternlogq $0x96,%zmm15,%zmm12,%zmm6 + vpxorq %zmm24,%zmm6,%zmm6 + vpternlogq $0x96,%zmm10,%zmm13,%zmm7 + vpxorq %zmm25,%zmm7,%zmm7 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vextracti64x4 $1,%zmm6,%ymm12 + vpxorq %ymm12,%ymm6,%ymm6 + vextracti32x4 $1,%ymm6,%xmm12 + vpxorq %xmm12,%xmm6,%xmm6 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 208(%rdi),%zmm31 + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 224(%rdi),%zmm30 + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm6 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + + + + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + + + + movq %r9,%r10 + vmovdqu8 %zmm0,512(%r10,%r11,1) + vmovdqu8 %zmm3,576(%r10,%r11,1) + vmovdqu8 %zmm4,640(%r10,%r11,1) + vmovdqu8 %zmm5,704(%r10,%r11,1) + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 + vmovdqa64 %zmm0,1024(%rsp) + vmovdqa64 %zmm3,1088(%rsp) + vmovdqa64 %zmm4,1152(%rsp) + vmovdqa64 %zmm5,1216(%rsp) + vmovdqa64 %zmm6,%zmm14 + + addq $768,%r11 + subq $768,%r8 + cmpq $768,%r8 + jae .L_encrypt_big_nblocks_334 
+ +.L_no_more_big_nblocks_334: + + cmpq $512,%r8 + jae .L_encrypt_32_blocks_334 + + cmpq $256,%r8 + jae .L_encrypt_16_blocks_334 +.L_encrypt_0_blocks_ghash_32_334: + movl %r8d,%r10d + andl $~15,%r10d + movl $256,%ebx + subl %r10d,%ebx + vmovdqa64 768(%rsp),%zmm13 + vpxorq %zmm14,%zmm13,%zmm13 + vmovdqu64 0(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 832(%rsp),%zmm13 + vmovdqu64 64(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + vpxorq %zmm10,%zmm4,%zmm26 + vpxorq %zmm6,%zmm0,%zmm24 + vpxorq %zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + vmovdqa64 896(%rsp),%zmm13 + vmovdqu64 128(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 960(%rsp),%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + + vpternlogq $0x96,%zmm10,%zmm4,%zmm26 + vpternlogq $0x96,%zmm6,%zmm0,%zmm24 + vpternlogq $0x96,%zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + addl $256,%ebx + movl %r8d,%r10d + addl $15,%r10d + shrl $4,%r10d + je .L_last_num_blocks_is_0_343 + + cmpl $8,%r10d + je .L_last_num_blocks_is_8_343 + jb .L_last_num_blocks_is_7_1_343 + + + cmpl $12,%r10d + je .L_last_num_blocks_is_12_343 + jb .L_last_num_blocks_is_11_9_343 + + + cmpl $15,%r10d + je .L_last_num_blocks_is_15_343 + ja .L_last_num_blocks_is_16_343 + cmpl $14,%r10d + je .L_last_num_blocks_is_14_343 + jmp .L_last_num_blocks_is_13_343 + +.L_last_num_blocks_is_11_9_343: + + cmpl $10,%r10d + je .L_last_num_blocks_is_10_343 + ja .L_last_num_blocks_is_11_343 + jmp .L_last_num_blocks_is_9_343 + +.L_last_num_blocks_is_7_1_343: + cmpl $4,%r10d + je .L_last_num_blocks_is_4_343 + jb .L_last_num_blocks_is_3_1_343 + + cmpl $6,%r10d + ja .L_last_num_blocks_is_7_343 + je .L_last_num_blocks_is_6_343 + jmp .L_last_num_blocks_is_5_343 + +.L_last_num_blocks_is_3_1_343: + + cmpl $2,%r10d + ja .L_last_num_blocks_is_3_343 + je .L_last_num_blocks_is_2_343 +.L_last_num_blocks_is_1_343: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $255,%r15d + jae .L_16_blocks_overflow_344 + vpaddd %xmm28,%xmm2,%xmm0 + jmp .L_16_blocks_ok_344 + +.L_16_blocks_overflow_344: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %xmm29,%xmm0,%xmm0 +.L_16_blocks_ok_344: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 
192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%xmm17{%k1}{z} + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %xmm31,%xmm0,%xmm0 + vaesenclast %xmm30,%xmm0,%xmm0 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti32x4 $0,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %xmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %xmm29,%xmm0,%xmm17 + vextracti32x4 $0,%zmm17,%xmm7 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_345 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_345 +.L_small_initial_partial_block_345: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + + + vpsrldq $8,%zmm26,%zmm0 + vpslldq $8,%zmm26,%zmm3 + vpxorq %zmm0,%zmm24,%zmm24 + vpxorq %zmm3,%zmm25,%zmm25 + vextracti64x4 $1,%zmm24,%ymm0 + vpxorq %ymm0,%ymm24,%ymm24 + vextracti32x4 $1,%ymm24,%xmm0 + vpxorq %xmm0,%xmm24,%xmm24 + vextracti64x4 $1,%zmm25,%ymm3 + vpxorq %ymm3,%ymm25,%ymm25 + vextracti32x4 $1,%ymm25,%xmm3 + vpxorq %xmm3,%xmm25,%xmm25 + vmovdqa64 POLY2(%rip),%xmm0 + + + vpclmulqdq $0x01,%xmm25,%xmm0,%xmm3 + vpslldq $8,%xmm3,%xmm3 + vpxorq %xmm3,%xmm25,%xmm3 + + + vpclmulqdq $0x00,%xmm3,%xmm0,%xmm4 + vpsrldq $4,%xmm4,%xmm4 + vpclmulqdq $0x10,%xmm3,%xmm0,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm24,%xmm4,%xmm14 + + + + + + + + + + 
+ + + vpxorq %xmm7,%xmm14,%xmm14 + + jmp .L_after_reduction_345 +.L_small_initial_compute_done_345: +.L_after_reduction_345: + jmp .L_last_blocks_done_343 +.L_last_num_blocks_is_2_343: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $254,%r15d + jae .L_16_blocks_overflow_346 + vpaddd %ymm28,%ymm2,%ymm0 + jmp .L_16_blocks_ok_346 + +.L_16_blocks_overflow_346: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %ymm29,%ymm0,%ymm0 +.L_16_blocks_ok_346: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%ymm17{%k1}{z} + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %ymm31,%ymm0,%ymm0 + vaesenclast %ymm30,%ymm0,%ymm0 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %ymm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %ymm29,%ymm0,%ymm17 + vextracti32x4 $1,%zmm17,%xmm7 + subq $16 * (2 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_347 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + 
vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_347 +.L_small_initial_partial_block_347: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_347: + + orq %r8,%r8 + je .L_after_reduction_347 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_347: + jmp .L_last_blocks_done_343 +.L_last_num_blocks_is_3_343: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $253,%r15d + jae .L_16_blocks_overflow_348 + vpaddd %zmm28,%zmm2,%zmm0 + jmp .L_16_blocks_ok_348 + +.L_16_blocks_overflow_348: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %zmm29,%zmm0,%zmm0 +.L_16_blocks_ok_348: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z} + vaesenc 
%zmm31,%zmm0,%zmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm0,%zmm0 + vpxorq %zmm17,%zmm0,%zmm0 + vextracti32x4 $2,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vextracti32x4 $2,%zmm17,%xmm7 + subq $16 * (3 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_349 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_349 +.L_small_initial_partial_block_349: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_349: + + orq %r8,%r8 + je .L_after_reduction_349 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_349: + jmp .L_last_blocks_done_343 +.L_last_num_blocks_is_4_343: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl 
$252,%r15d + jae .L_16_blocks_overflow_350 + vpaddd %zmm28,%zmm2,%zmm0 + jmp .L_16_blocks_ok_350 + +.L_16_blocks_overflow_350: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %zmm29,%zmm0,%zmm0 +.L_16_blocks_ok_350: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm0,%zmm0 + vpxorq %zmm17,%zmm0,%zmm0 + vextracti32x4 $3,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vextracti32x4 $3,%zmm17,%xmm7 + subq $16 * (4 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_351 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + 
+ + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_351 +.L_small_initial_partial_block_351: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_351: + + orq %r8,%r8 + je .L_after_reduction_351 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_351: + jmp .L_last_blocks_done_343 +.L_last_num_blocks_is_5_343: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $251,%r15d + jae .L_16_blocks_overflow_352 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %xmm27,%xmm0,%xmm3 + jmp .L_16_blocks_ok_352 + +.L_16_blocks_overflow_352: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %xmm29,%xmm3,%xmm3 +.L_16_blocks_ok_352: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 
0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%xmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %xmm30,%xmm3,%xmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %xmm19,%xmm3,%xmm3 + vextracti32x4 $0,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %xmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %xmm29,%xmm3,%xmm19 + vextracti32x4 $0,%zmm19,%xmm7 + subq $16 * (5 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_353 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_353 +.L_small_initial_partial_block_353: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 
$1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_353: + + orq %r8,%r8 + je .L_after_reduction_353 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_353: + jmp .L_last_blocks_done_343 +.L_last_num_blocks_is_6_343: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $250,%r15d + jae .L_16_blocks_overflow_354 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %ymm27,%ymm0,%ymm3 + jmp .L_16_blocks_ok_354 + +.L_16_blocks_overflow_354: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %ymm29,%ymm3,%ymm3 +.L_16_blocks_ok_354: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%ymm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + 
vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %ymm30,%ymm3,%ymm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %ymm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %ymm29,%ymm3,%ymm19 + vextracti32x4 $1,%zmm19,%xmm7 + subq $16 * (6 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_355 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_355 +.L_small_initial_partial_block_355: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_355: + + orq %r8,%r8 + je .L_after_reduction_355 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_355: + jmp .L_last_blocks_done_343 +.L_last_num_blocks_is_7_343: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $249,%r15d + jae .L_16_blocks_overflow_356 + vpaddd 
%zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + jmp .L_16_blocks_ok_356 + +.L_16_blocks_overflow_356: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 +.L_16_blocks_ok_356: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti32x4 $2,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vextracti32x4 $2,%zmm19,%xmm7 + subq $16 * (7 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_357 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq 
$0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_357 +.L_small_initial_partial_block_357: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_357: + + orq %r8,%r8 + je .L_after_reduction_357 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_357: + jmp .L_last_blocks_done_343 +.L_last_num_blocks_is_8_343: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $248,%r15d + jae .L_16_blocks_overflow_358 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + jmp .L_16_blocks_ok_358 + +.L_16_blocks_overflow_358: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 +.L_16_blocks_ok_358: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq 
$0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti32x4 $3,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vextracti32x4 $3,%zmm19,%xmm7 + subq $16 * (8 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_359 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq 
%ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_359 +.L_small_initial_partial_block_359: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_359: + + orq %r8,%r8 + je .L_after_reduction_359 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_359: + jmp .L_last_blocks_done_343 +.L_last_num_blocks_is_9_343: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $247,%r15d + jae .L_16_blocks_overflow_360 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %xmm27,%xmm3,%xmm4 + jmp .L_16_blocks_ok_360 + +.L_16_blocks_overflow_360: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %xmm29,%xmm4,%xmm4 +.L_16_blocks_ok_360: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 
1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%xmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %xmm30,%xmm4,%xmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %xmm20,%xmm4,%xmm4 + vextracti32x4 $0,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %xmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %xmm29,%xmm4,%xmm20 + vextracti32x4 $0,%zmm20,%xmm7 + subq $16 * (9 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_361 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq 
$0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_361 +.L_small_initial_partial_block_361: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_361: + + orq %r8,%r8 + je .L_after_reduction_361 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_361: + jmp .L_last_blocks_done_343 +.L_last_num_blocks_is_10_343: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $246,%r15d + jae .L_16_blocks_overflow_362 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %ymm27,%ymm3,%ymm4 + jmp .L_16_blocks_ok_362 + +.L_16_blocks_overflow_362: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %ymm29,%ymm4,%ymm4 +.L_16_blocks_ok_362: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc 
%ymm31,%ymm4,%ymm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%ymm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %ymm30,%ymm4,%ymm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %ymm20,%ymm4,%ymm4 + vextracti32x4 $1,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %ymm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %ymm29,%ymm4,%ymm20 + vextracti32x4 $1,%zmm20,%xmm7 + subq $16 * (10 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_363 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + 
vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_363 +.L_small_initial_partial_block_363: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_363: + + orq %r8,%r8 + je .L_after_reduction_363 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_363: + jmp .L_last_blocks_done_343 +.L_last_num_blocks_is_11_343: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $245,%r15d + jae .L_16_blocks_overflow_364 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + jmp .L_16_blocks_ok_364 + +.L_16_blocks_overflow_364: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 +.L_16_blocks_ok_364: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 
16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vextracti32x4 $2,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vextracti32x4 $2,%zmm20,%xmm7 + subq $16 * (11 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_365 + + + + + + subq $16,%r8 + movq 
$0,(%rdx) + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_365 +.L_small_initial_partial_block_365: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_365: + + orq %r8,%r8 + je .L_after_reduction_365 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_365: + jmp .L_last_blocks_done_343 +.L_last_num_blocks_is_12_343: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $244,%r15d + jae .L_16_blocks_overflow_366 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd 
%zmm27,%zmm3,%zmm4 + jmp .L_16_blocks_ok_366 + +.L_16_blocks_overflow_366: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 +.L_16_blocks_ok_366: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + 
vaesenclast %zmm30,%zmm4,%zmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vextracti32x4 $3,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vextracti32x4 $3,%zmm20,%xmm7 + subq $16 * (12 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_367 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 160(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_367 +.L_small_initial_partial_block_367: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq 
$4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_367: + + orq %r8,%r8 + je .L_after_reduction_367 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_367: + jmp .L_last_blocks_done_343 +.L_last_num_blocks_is_13_343: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $243,%r15d + jae .L_16_blocks_overflow_368 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %xmm27,%xmm4,%xmm5 + jmp .L_16_blocks_ok_368 + +.L_16_blocks_overflow_368: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %xmm29,%xmm5,%xmm5 +.L_16_blocks_ok_368: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%xmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc 
%zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %xmm30,%xmm5,%xmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %xmm21,%xmm5,%xmm5 + vextracti32x4 $0,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %xmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vpshufb %xmm29,%xmm5,%xmm21 + vextracti32x4 $0,%zmm21,%xmm7 + subq $16 * (13 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_369 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 144(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_369 +.L_small_initial_partial_block_369: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 
%xmm11,16(%rsi) + vmovdqu64 160(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_369: + + orq %r8,%r8 + je .L_after_reduction_369 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_369: + jmp .L_last_blocks_done_343 +.L_last_num_blocks_is_14_343: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $242,%r15d + jae .L_16_blocks_overflow_370 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %ymm27,%ymm4,%ymm5 + jmp .L_16_blocks_ok_370 + +.L_16_blocks_overflow_370: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %ymm29,%ymm5,%ymm5 +.L_16_blocks_ok_370: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq 
$0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%ymm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %ymm30,%ymm5,%ymm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %ymm21,%ymm5,%ymm5 + vextracti32x4 $1,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %ymm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vpshufb %ymm29,%ymm5,%ymm21 + vextracti32x4 $1,%zmm21,%xmm7 + subq $16 * (14 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_371 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 128(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq 
$0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_371 +.L_small_initial_partial_block_371: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 144(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_371: + + orq %r8,%r8 + je .L_after_reduction_371 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_371: + jmp .L_last_blocks_done_343 +.L_last_num_blocks_is_15_343: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $241,%r15d + jae .L_16_blocks_overflow_372 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + 
jmp .L_16_blocks_ok_372 + +.L_16_blocks_overflow_372: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_372: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc 
%zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + vextracti32x4 $2,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vpshufb %zmm29,%zmm5,%zmm21 + vextracti32x4 $2,%zmm21,%xmm7 + subq $16 * (15 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_373 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 112(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_373 +.L_small_initial_partial_block_373: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 128(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq 
$0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_373: + + orq %r8,%r8 + je .L_after_reduction_373 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_373: + jmp .L_last_blocks_done_343 +.L_last_num_blocks_is_16_343: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $240,%r15d + jae .L_16_blocks_overflow_374 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_374 + +.L_16_blocks_overflow_374: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_374: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq 
$0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + vextracti32x4 $3,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vpshufb %zmm29,%zmm5,%zmm21 + vextracti32x4 $3,%zmm21,%xmm7 + subq $16 * (16 - 1),%r8 +.L_small_initial_partial_block_375: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 112(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 
$2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_375: + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_375: + jmp .L_last_blocks_done_343 +.L_last_num_blocks_is_0_343: + vmovdqa64 1024(%rsp),%zmm13 + vmovdqu64 0(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 1088(%rsp),%zmm13 + vmovdqu64 64(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + vpternlogq $0x96,%zmm10,%zmm4,%zmm26 + vpternlogq $0x96,%zmm6,%zmm0,%zmm24 + vpternlogq $0x96,%zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + vmovdqa64 1152(%rsp),%zmm13 + vmovdqu64 128(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 1216(%rsp),%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + + vpternlogq $0x96,%zmm10,%zmm4,%zmm26 + vpternlogq $0x96,%zmm6,%zmm0,%zmm24 + vpternlogq $0x96,%zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + + vpsrldq $8,%zmm26,%zmm0 + vpslldq $8,%zmm26,%zmm3 + vpxorq %zmm0,%zmm24,%zmm24 + vpxorq %zmm3,%zmm25,%zmm25 + vextracti64x4 $1,%zmm24,%ymm0 + vpxorq %ymm0,%ymm24,%ymm24 + vextracti32x4 $1,%ymm24,%xmm0 + vpxorq %xmm0,%xmm24,%xmm24 + vextracti64x4 $1,%zmm25,%ymm3 + vpxorq %ymm3,%ymm25,%ymm25 + vextracti32x4 $1,%ymm25,%xmm3 + vpxorq %xmm3,%xmm25,%xmm25 + vmovdqa64 POLY2(%rip),%xmm4 + + + vpclmulqdq $0x01,%xmm25,%xmm4,%xmm0 + vpslldq $8,%xmm0,%xmm0 + vpxorq %xmm0,%xmm25,%xmm0 + + + vpclmulqdq $0x00,%xmm0,%xmm4,%xmm3 + vpsrldq $4,%xmm3,%xmm3 + vpclmulqdq $0x10,%xmm0,%xmm4,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm24,%xmm3,%xmm14 + +.L_last_blocks_done_343: + vpshufb %xmm29,%xmm2,%xmm2 + jmp .L_ghash_done_334 +.L_encrypt_32_blocks_334: + cmpb $240,%r15b + jae .L_16_blocks_overflow_376 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_376 +.L_16_blocks_overflow_376: + vpshufb %zmm29,%zmm2,%zmm2 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb 
%zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_376: + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp),%zmm1 + + + + + vshufi64x2 $255,%zmm5,%zmm5,%zmm2 + addb $16,%r15b + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + + + + + + + + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm6 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + + + + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%zmm21 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm15,%zmm10,%zmm26 + vpxorq %zmm12,%zmm6,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 208(%rdi),%zmm31 + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 224(%rdi),%zmm30 + + 
vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + + + + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + + + + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1) + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 + vmovdqa64 %zmm0,1280(%rsp) + vmovdqa64 %zmm3,1344(%rsp) + vmovdqa64 %zmm4,1408(%rsp) + vmovdqa64 %zmm5,1472(%rsp) + cmpb $240,%r15b + jae .L_16_blocks_overflow_377 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_377 +.L_16_blocks_overflow_377: + vpshufb %zmm29,%zmm2,%zmm2 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_377: + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 256(%rsp),%zmm1 + + + + + vshufi64x2 $255,%zmm5,%zmm5,%zmm2 + addb $16,%r15b + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 320(%rsp),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + + + + + + + + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 384(%rsp),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 448(%rsp),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm6 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + + + + vmovdqu8 256(%rcx,%r11,1),%zmm17 + vmovdqu8 320(%rcx,%r11,1),%zmm19 + vmovdqu8 384(%rcx,%r11,1),%zmm20 + vmovdqu8 448(%rcx,%r11,1),%zmm21 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + + 
+ vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vpternlogq $0x96,%zmm12,%zmm6,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 208(%rdi),%zmm31 + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 224(%rdi),%zmm30 + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + + + + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + + + + movq %r9,%r10 + vmovdqu8 %zmm0,256(%r10,%r11,1) + vmovdqu8 %zmm3,320(%r10,%r11,1) + vmovdqu8 %zmm4,384(%r10,%r11,1) + vmovdqu8 %zmm5,448(%r10,%r11,1) + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 + vmovdqa64 %zmm0,768(%rsp) + vmovdqa64 %zmm3,832(%rsp) + vmovdqa64 %zmm4,896(%rsp) + vmovdqa64 %zmm5,960(%rsp) + vmovdqa64 1280(%rsp),%zmm13 + vmovdqu64 512(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 1344(%rsp),%zmm13 + vmovdqu64 576(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + vpternlogq $0x96,%zmm10,%zmm4,%zmm26 + vpternlogq $0x96,%zmm6,%zmm0,%zmm24 + vpternlogq $0x96,%zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + vmovdqa64 1408(%rsp),%zmm13 + vmovdqu64 640(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 1472(%rsp),%zmm13 + vmovdqu64 704(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + + vpternlogq $0x96,%zmm10,%zmm4,%zmm26 + vpternlogq $0x96,%zmm6,%zmm0,%zmm24 + vpternlogq $0x96,%zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + + vpsrldq $8,%zmm26,%zmm0 + vpslldq $8,%zmm26,%zmm3 + vpxorq %zmm0,%zmm24,%zmm24 + vpxorq %zmm3,%zmm25,%zmm25 + vextracti64x4 $1,%zmm24,%ymm0 + vpxorq %ymm0,%ymm24,%ymm24 + vextracti32x4 $1,%ymm24,%xmm0 + vpxorq %xmm0,%xmm24,%xmm24 + vextracti64x4 $1,%zmm25,%ymm3 + vpxorq %ymm3,%ymm25,%ymm25 + vextracti32x4 $1,%ymm25,%xmm3 + vpxorq %xmm3,%xmm25,%xmm25 + vmovdqa64 POLY2(%rip),%xmm4 + + + vpclmulqdq 
$0x01,%xmm25,%xmm4,%xmm0 + vpslldq $8,%xmm0,%xmm0 + vpxorq %xmm0,%xmm25,%xmm0 + + + vpclmulqdq $0x00,%xmm0,%xmm4,%xmm3 + vpsrldq $4,%xmm3,%xmm3 + vpclmulqdq $0x10,%xmm0,%xmm4,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm24,%xmm3,%xmm14 + + subq $512,%r8 + addq $512,%r11 + movl %r8d,%r10d + andl $~15,%r10d + movl $512,%ebx + subl %r10d,%ebx + movl %r8d,%r10d + addl $15,%r10d + shrl $4,%r10d + je .L_last_num_blocks_is_0_378 + + cmpl $8,%r10d + je .L_last_num_blocks_is_8_378 + jb .L_last_num_blocks_is_7_1_378 + + + cmpl $12,%r10d + je .L_last_num_blocks_is_12_378 + jb .L_last_num_blocks_is_11_9_378 + + + cmpl $15,%r10d + je .L_last_num_blocks_is_15_378 + ja .L_last_num_blocks_is_16_378 + cmpl $14,%r10d + je .L_last_num_blocks_is_14_378 + jmp .L_last_num_blocks_is_13_378 + +.L_last_num_blocks_is_11_9_378: + + cmpl $10,%r10d + je .L_last_num_blocks_is_10_378 + ja .L_last_num_blocks_is_11_378 + jmp .L_last_num_blocks_is_9_378 + +.L_last_num_blocks_is_7_1_378: + cmpl $4,%r10d + je .L_last_num_blocks_is_4_378 + jb .L_last_num_blocks_is_3_1_378 + + cmpl $6,%r10d + ja .L_last_num_blocks_is_7_378 + je .L_last_num_blocks_is_6_378 + jmp .L_last_num_blocks_is_5_378 + +.L_last_num_blocks_is_3_1_378: + + cmpl $2,%r10d + ja .L_last_num_blocks_is_3_378 + je .L_last_num_blocks_is_2_378 +.L_last_num_blocks_is_1_378: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $255,%r15d + jae .L_16_blocks_overflow_379 + vpaddd %xmm28,%xmm2,%xmm0 + jmp .L_16_blocks_ok_379 + +.L_16_blocks_overflow_379: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %xmm29,%xmm0,%xmm0 +.L_16_blocks_ok_379: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%xmm17{%k1}{z} + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + 
vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %xmm31,%xmm0,%xmm0 + vaesenclast %xmm30,%xmm0,%xmm0 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti32x4 $0,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %xmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %xmm29,%xmm0,%xmm17 + vextracti32x4 $0,%zmm17,%xmm7 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_380 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_380 +.L_small_initial_partial_block_380: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + + + vpsrldq $8,%zmm26,%zmm0 + vpslldq $8,%zmm26,%zmm3 + vpxorq %zmm0,%zmm24,%zmm24 + vpxorq %zmm3,%zmm25,%zmm25 + vextracti64x4 $1,%zmm24,%ymm0 + vpxorq %ymm0,%ymm24,%ymm24 + vextracti32x4 $1,%ymm24,%xmm0 + vpxorq %xmm0,%xmm24,%xmm24 + vextracti64x4 $1,%zmm25,%ymm3 + vpxorq %ymm3,%ymm25,%ymm25 + vextracti32x4 $1,%ymm25,%xmm3 + vpxorq %xmm3,%xmm25,%xmm25 + vmovdqa64 POLY2(%rip),%xmm0 + + + vpclmulqdq $0x01,%xmm25,%xmm0,%xmm3 + vpslldq $8,%xmm3,%xmm3 + vpxorq %xmm3,%xmm25,%xmm3 + + + vpclmulqdq $0x00,%xmm3,%xmm0,%xmm4 + vpsrldq $4,%xmm4,%xmm4 + vpclmulqdq $0x10,%xmm3,%xmm0,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm24,%xmm4,%xmm14 + + + + + + + + + + + + + vpxorq %xmm7,%xmm14,%xmm14 + + jmp .L_after_reduction_380 +.L_small_initial_compute_done_380: +.L_after_reduction_380: + jmp .L_last_blocks_done_378 +.L_last_num_blocks_is_2_378: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $254,%r15d + jae .L_16_blocks_overflow_381 + vpaddd %ymm28,%ymm2,%ymm0 + jmp .L_16_blocks_ok_381 + +.L_16_blocks_overflow_381: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %ymm29,%ymm0,%ymm0 +.L_16_blocks_ok_381: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %ymm31,%ymm0,%ymm0 + 
vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%ymm17{%k1}{z} + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %ymm31,%ymm0,%ymm0 + vaesenclast %ymm30,%ymm0,%ymm0 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %ymm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %ymm29,%ymm0,%ymm17 + vextracti32x4 $1,%zmm17,%xmm7 + subq $16 * (2 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_382 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_382 +.L_small_initial_partial_block_382: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + 
vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_382: + + orq %r8,%r8 + je .L_after_reduction_382 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_382: + jmp .L_last_blocks_done_378 +.L_last_num_blocks_is_3_378: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $253,%r15d + jae .L_16_blocks_overflow_383 + vpaddd %zmm28,%zmm2,%zmm0 + jmp .L_16_blocks_ok_383 + +.L_16_blocks_overflow_383: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %zmm29,%zmm0,%zmm0 +.L_16_blocks_ok_383: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm0,%zmm0 + vpxorq %zmm17,%zmm0,%zmm0 + vextracti32x4 $2,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vextracti32x4 $2,%zmm17,%xmm7 + subq $16 * (3 - 1),%r8 + + + cmpq $16,%r8 + jl 
.L_small_initial_partial_block_384 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_384 +.L_small_initial_partial_block_384: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_384: + + orq %r8,%r8 + je .L_after_reduction_384 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_384: + jmp .L_last_blocks_done_378 +.L_last_num_blocks_is_4_378: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $252,%r15d + jae .L_16_blocks_overflow_385 + vpaddd %zmm28,%zmm2,%zmm0 + jmp .L_16_blocks_ok_385 + +.L_16_blocks_overflow_385: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %zmm29,%zmm0,%zmm0 +.L_16_blocks_ok_385: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 
64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm0,%zmm0 + vpxorq %zmm17,%zmm0,%zmm0 + vextracti32x4 $3,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vextracti32x4 $3,%zmm17,%xmm7 + subq $16 * (4 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_386 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_386 +.L_small_initial_partial_block_386: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq 
$8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_386: + + orq %r8,%r8 + je .L_after_reduction_386 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_386: + jmp .L_last_blocks_done_378 +.L_last_num_blocks_is_5_378: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $251,%r15d + jae .L_16_blocks_overflow_387 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %xmm27,%xmm0,%xmm3 + jmp .L_16_blocks_ok_387 + +.L_16_blocks_overflow_387: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %xmm29,%xmm3,%xmm3 +.L_16_blocks_ok_387: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%xmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + 
vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %xmm30,%xmm3,%xmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %xmm19,%xmm3,%xmm3 + vextracti32x4 $0,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %xmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %xmm29,%xmm3,%xmm19 + vextracti32x4 $0,%zmm19,%xmm7 + subq $16 * (5 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_388 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_388 +.L_small_initial_partial_block_388: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_388: + + orq %r8,%r8 + je .L_after_reduction_388 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_388: + jmp .L_last_blocks_done_378 +.L_last_num_blocks_is_6_378: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $250,%r15d + jae .L_16_blocks_overflow_389 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %ymm27,%ymm0,%ymm3 + jmp .L_16_blocks_ok_389 + +.L_16_blocks_overflow_389: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %ymm29,%ymm3,%ymm3 +.L_16_blocks_ok_389: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 
$1,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%ymm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %ymm30,%ymm3,%ymm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %ymm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %ymm29,%ymm3,%ymm19 + vextracti32x4 $1,%zmm19,%xmm7 + subq $16 * (6 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_390 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq 
%zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_390 +.L_small_initial_partial_block_390: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_390: + + orq %r8,%r8 + je .L_after_reduction_390 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_390: + jmp .L_last_blocks_done_378 +.L_last_num_blocks_is_7_378: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $249,%r15d + jae .L_16_blocks_overflow_391 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + jmp .L_16_blocks_ok_391 + +.L_16_blocks_overflow_391: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 +.L_16_blocks_ok_391: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + 
vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti32x4 $2,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vextracti32x4 $2,%zmm19,%xmm7 + subq $16 * (7 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_392 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_392 +.L_small_initial_partial_block_392: + + + 
+ + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_392: + + orq %r8,%r8 + je .L_after_reduction_392 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_392: + jmp .L_last_blocks_done_378 +.L_last_num_blocks_is_8_378: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $248,%r15d + jae .L_16_blocks_overflow_393 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + jmp .L_16_blocks_ok_393 + +.L_16_blocks_overflow_393: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 +.L_16_blocks_ok_393: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 
112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti32x4 $3,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vextracti32x4 $3,%zmm19,%xmm7 + subq $16 * (8 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_394 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_394 +.L_small_initial_partial_block_394: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq 
%zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_394: + + orq %r8,%r8 + je .L_after_reduction_394 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_394: + jmp .L_last_blocks_done_378 +.L_last_num_blocks_is_9_378: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $247,%r15d + jae .L_16_blocks_overflow_395 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %xmm27,%xmm3,%xmm4 + jmp .L_16_blocks_ok_395 + +.L_16_blocks_overflow_395: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %xmm29,%xmm4,%xmm4 +.L_16_blocks_ok_395: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%xmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq 
%zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %xmm30,%xmm4,%xmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %xmm20,%xmm4,%xmm4 + vextracti32x4 $0,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %xmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %xmm29,%xmm4,%xmm20 + vextracti32x4 $0,%zmm20,%xmm7 + subq $16 * (9 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_396 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_396 +.L_small_initial_partial_block_396: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq 
%zmm31,%zmm5,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_396: + + orq %r8,%r8 + je .L_after_reduction_396 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_396: + jmp .L_last_blocks_done_378 +.L_last_num_blocks_is_10_378: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $246,%r15d + jae .L_16_blocks_overflow_397 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %ymm27,%ymm3,%ymm4 + jmp .L_16_blocks_ok_397 + +.L_16_blocks_overflow_397: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %ymm29,%ymm4,%ymm4 +.L_16_blocks_ok_397: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%ymm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq 
$0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %ymm30,%ymm4,%ymm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %ymm20,%ymm4,%ymm4 + vextracti32x4 $1,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %ymm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %ymm29,%ymm4,%ymm20 + vextracti32x4 $1,%zmm20,%xmm7 + subq $16 * (10 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_398 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_398 +.L_small_initial_partial_block_398: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq 
$0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_398: + + orq %r8,%r8 + je .L_after_reduction_398 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_398: + jmp .L_last_blocks_done_378 +.L_last_num_blocks_is_11_378: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $245,%r15d + jae .L_16_blocks_overflow_399 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + jmp .L_16_blocks_ok_399 + +.L_16_blocks_overflow_399: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 +.L_16_blocks_ok_399: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc 
%zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vextracti32x4 $2,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vextracti32x4 $2,%zmm20,%xmm7 + subq $16 * (11 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_400 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq 
$0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_400 +.L_small_initial_partial_block_400: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_400: + + orq %r8,%r8 + je .L_after_reduction_400 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_400: + jmp .L_last_blocks_done_378 +.L_last_num_blocks_is_12_378: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $244,%r15d + jae .L_16_blocks_overflow_401 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + jmp .L_16_blocks_ok_401 + +.L_16_blocks_overflow_401: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 +.L_16_blocks_ok_401: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc 
%zmm30,%zmm4,%zmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vextracti32x4 $3,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vextracti32x4 $3,%zmm20,%xmm7 + subq $16 * (12 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_402 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 160(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq 
$8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_402 +.L_small_initial_partial_block_402: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_402: + + orq %r8,%r8 + je .L_after_reduction_402 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_402: + jmp .L_last_blocks_done_378 +.L_last_num_blocks_is_13_378: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $243,%r15d + jae .L_16_blocks_overflow_403 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %xmm27,%xmm4,%xmm5 + jmp .L_16_blocks_ok_403 + +.L_16_blocks_overflow_403: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %xmm29,%xmm5,%xmm5 +.L_16_blocks_ok_403: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq 
$0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%xmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %xmm30,%xmm5,%xmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %xmm21,%xmm5,%xmm5 + vextracti32x4 $0,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %xmm5,192(%r10,%r11,1){%k1} + vmovdqu8 
%zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vpshufb %xmm29,%xmm5,%xmm21 + vextracti32x4 $0,%zmm21,%xmm7 + subq $16 * (13 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_404 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 144(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_404 +.L_small_initial_partial_block_404: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 160(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq 
$4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_404: + + orq %r8,%r8 + je .L_after_reduction_404 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_404: + jmp .L_last_blocks_done_378 +.L_last_num_blocks_is_14_378: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $242,%r15d + jae .L_16_blocks_overflow_405 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %ymm27,%ymm4,%ymm5 + jmp .L_16_blocks_ok_405 + +.L_16_blocks_overflow_405: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %ymm29,%ymm5,%ymm5 +.L_16_blocks_ok_405: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%ymm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + 
vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %ymm30,%ymm5,%ymm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %ymm21,%ymm5,%ymm5 + vextracti32x4 $1,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %ymm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vpshufb %ymm29,%ymm5,%ymm21 + vextracti32x4 $1,%zmm21,%xmm7 + subq $16 * (14 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_406 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 128(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_406 +.L_small_initial_partial_block_406: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 144(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + 
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_406: + + orq %r8,%r8 + je .L_after_reduction_406 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_406: + jmp .L_last_blocks_done_378 +.L_last_num_blocks_is_15_378: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $241,%r15d + jae .L_16_blocks_overflow_407 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_407 + +.L_16_blocks_overflow_407: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_407: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc 
%zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + vextracti32x4 $2,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vpshufb %zmm29,%zmm5,%zmm21 + vextracti32x4 $2,%zmm21,%xmm7 + subq $16 * (15 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_408 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 112(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq 
$0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_408 +.L_small_initial_partial_block_408: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 128(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_408: + + orq %r8,%r8 + je .L_after_reduction_408 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_408: + jmp .L_last_blocks_done_378 +.L_last_num_blocks_is_16_378: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $240,%r15d 
+ jae .L_16_blocks_overflow_409 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_409 + +.L_16_blocks_overflow_409: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_409: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc 
%zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + vextracti32x4 $3,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vpshufb %zmm29,%zmm5,%zmm21 + vextracti32x4 $3,%zmm21,%xmm7 + subq $16 * (16 - 1),%r8 +.L_small_initial_partial_block_410: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 112(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_410: + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_410: + jmp .L_last_blocks_done_378 +.L_last_num_blocks_is_0_378: + vmovdqa64 768(%rsp),%zmm13 + vpxorq %zmm14,%zmm13,%zmm13 + vmovdqu64 0(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 832(%rsp),%zmm13 + vmovdqu64 64(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + 
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + vpxorq %zmm10,%zmm4,%zmm26 + vpxorq %zmm6,%zmm0,%zmm24 + vpxorq %zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + vmovdqa64 896(%rsp),%zmm13 + vmovdqu64 128(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 960(%rsp),%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + + vpternlogq $0x96,%zmm10,%zmm4,%zmm26 + vpternlogq $0x96,%zmm6,%zmm0,%zmm24 + vpternlogq $0x96,%zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + + vpsrldq $8,%zmm26,%zmm0 + vpslldq $8,%zmm26,%zmm3 + vpxorq %zmm0,%zmm24,%zmm24 + vpxorq %zmm3,%zmm25,%zmm25 + vextracti64x4 $1,%zmm24,%ymm0 + vpxorq %ymm0,%ymm24,%ymm24 + vextracti32x4 $1,%ymm24,%xmm0 + vpxorq %xmm0,%xmm24,%xmm24 + vextracti64x4 $1,%zmm25,%ymm3 + vpxorq %ymm3,%ymm25,%ymm25 + vextracti32x4 $1,%ymm25,%xmm3 + vpxorq %xmm3,%xmm25,%xmm25 + vmovdqa64 POLY2(%rip),%xmm4 + + + vpclmulqdq $0x01,%xmm25,%xmm4,%xmm0 + vpslldq $8,%xmm0,%xmm0 + vpxorq %xmm0,%xmm25,%xmm0 + + + vpclmulqdq $0x00,%xmm0,%xmm4,%xmm3 + vpsrldq $4,%xmm3,%xmm3 + vpclmulqdq $0x10,%xmm0,%xmm4,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm24,%xmm3,%xmm14 + +.L_last_blocks_done_378: + vpshufb %xmm29,%xmm2,%xmm2 + jmp .L_ghash_done_334 +.L_encrypt_16_blocks_334: + cmpb $240,%r15b + jae .L_16_blocks_overflow_411 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_411 +.L_16_blocks_overflow_411: + vpshufb %zmm29,%zmm2,%zmm2 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_411: + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp),%zmm1 + + + + + vshufi64x2 $255,%zmm5,%zmm5,%zmm2 + addb $16,%r15b + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + + + + + + + + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 
80(%rdi),%zmm31 + + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm6 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + + + + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%zmm21 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm15,%zmm10,%zmm26 + vpxorq %zmm12,%zmm6,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 208(%rdi),%zmm31 + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 224(%rdi),%zmm30 + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + + + + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + + + + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1) + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 + vmovdqa64 %zmm0,1280(%rsp) + vmovdqa64 %zmm3,1344(%rsp) + vmovdqa64 %zmm4,1408(%rsp) + vmovdqa64 %zmm5,1472(%rsp) + vmovdqa64 1024(%rsp),%zmm13 + vmovdqu64 256(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 1088(%rsp),%zmm13 + vmovdqu64 320(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + vpternlogq $0x96,%zmm10,%zmm4,%zmm26 + vpternlogq $0x96,%zmm6,%zmm0,%zmm24 + vpternlogq $0x96,%zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + vmovdqa64 1152(%rsp),%zmm13 + vmovdqu64 384(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + 
vmovdqa64 1216(%rsp),%zmm13 + vmovdqu64 448(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + + vpternlogq $0x96,%zmm10,%zmm4,%zmm26 + vpternlogq $0x96,%zmm6,%zmm0,%zmm24 + vpternlogq $0x96,%zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + subq $256,%r8 + addq $256,%r11 + movl %r8d,%r10d + addl $15,%r10d + shrl $4,%r10d + je .L_last_num_blocks_is_0_412 + + cmpl $8,%r10d + je .L_last_num_blocks_is_8_412 + jb .L_last_num_blocks_is_7_1_412 + + + cmpl $12,%r10d + je .L_last_num_blocks_is_12_412 + jb .L_last_num_blocks_is_11_9_412 + + + cmpl $15,%r10d + je .L_last_num_blocks_is_15_412 + ja .L_last_num_blocks_is_16_412 + cmpl $14,%r10d + je .L_last_num_blocks_is_14_412 + jmp .L_last_num_blocks_is_13_412 + +.L_last_num_blocks_is_11_9_412: + + cmpl $10,%r10d + je .L_last_num_blocks_is_10_412 + ja .L_last_num_blocks_is_11_412 + jmp .L_last_num_blocks_is_9_412 + +.L_last_num_blocks_is_7_1_412: + cmpl $4,%r10d + je .L_last_num_blocks_is_4_412 + jb .L_last_num_blocks_is_3_1_412 + + cmpl $6,%r10d + ja .L_last_num_blocks_is_7_412 + je .L_last_num_blocks_is_6_412 + jmp .L_last_num_blocks_is_5_412 + +.L_last_num_blocks_is_3_1_412: + + cmpl $2,%r10d + ja .L_last_num_blocks_is_3_412 + je .L_last_num_blocks_is_2_412 +.L_last_num_blocks_is_1_412: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $255,%r15d + jae .L_16_blocks_overflow_413 + vpaddd %xmm28,%xmm2,%xmm0 + jmp .L_16_blocks_ok_413 + +.L_16_blocks_overflow_413: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %xmm29,%xmm0,%xmm0 +.L_16_blocks_ok_413: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $0,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%xmm17{%k1}{z} + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc 
%xmm31,%xmm0,%xmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %xmm31,%xmm0,%xmm0 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vbroadcastf64x2 176(%rdi),%zmm31 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %xmm31,%xmm0,%xmm0 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %xmm30,%xmm0,%xmm0 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti32x4 $0,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %xmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %xmm29,%xmm0,%xmm17 + vextracti32x4 $0,%zmm17,%xmm7 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_414 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_414 +.L_small_initial_partial_block_414: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + + + + + + + + + + + + vpxorq %xmm7,%xmm14,%xmm14 + + jmp .L_after_reduction_414 +.L_small_initial_compute_done_414: +.L_after_reduction_414: + jmp .L_last_blocks_done_412 +.L_last_num_blocks_is_2_412: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $254,%r15d + jae .L_16_blocks_overflow_415 + vpaddd %ymm28,%ymm2,%ymm0 + jmp .L_16_blocks_ok_415 + +.L_16_blocks_overflow_415: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %ymm29,%ymm0,%ymm0 +.L_16_blocks_ok_415: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $1,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 
48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%ymm17{%k1}{z} + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %ymm31,%ymm0,%ymm0 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vbroadcastf64x2 176(%rdi),%zmm31 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %ymm31,%ymm0,%ymm0 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %ymm30,%ymm0,%ymm0 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %ymm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %ymm29,%ymm0,%ymm17 + vextracti32x4 $1,%zmm17,%xmm7 + subq $16 * (2 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_416 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq 
$0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_416 +.L_small_initial_partial_block_416: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_416: + + orq %r8,%r8 + je .L_after_reduction_416 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_416: + jmp .L_last_blocks_done_412 +.L_last_num_blocks_is_3_412: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $253,%r15d + jae .L_16_blocks_overflow_417 + vpaddd %zmm28,%zmm2,%zmm0 + jmp .L_16_blocks_ok_417 + +.L_16_blocks_overflow_417: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %zmm29,%zmm0,%zmm0 +.L_16_blocks_ok_417: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $2,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + 
vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vbroadcastf64x2 176(%rdi),%zmm31 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vpxorq %zmm17,%zmm0,%zmm0 + vextracti32x4 $2,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vextracti32x4 $2,%zmm17,%xmm7 + subq $16 * (3 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_418 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_418 +.L_small_initial_partial_block_418: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_418: + + orq %r8,%r8 + je .L_after_reduction_418 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_418: + jmp .L_last_blocks_done_412 +.L_last_num_blocks_is_4_412: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $252,%r15d + jae 
.L_16_blocks_overflow_419 + vpaddd %zmm28,%zmm2,%zmm0 + jmp .L_16_blocks_ok_419 + +.L_16_blocks_overflow_419: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %zmm29,%zmm0,%zmm0 +.L_16_blocks_ok_419: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $3,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vbroadcastf64x2 176(%rdi),%zmm31 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vpxorq %zmm17,%zmm0,%zmm0 + vextracti32x4 $3,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vextracti32x4 $3,%zmm17,%xmm7 + subq $16 * (4 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_420 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 288(%rsi),%zmm1 + 
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_420 +.L_small_initial_partial_block_420: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_420: + + orq %r8,%r8 + je .L_after_reduction_420 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_420: + jmp .L_last_blocks_done_412 +.L_last_num_blocks_is_5_412: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $251,%r15d + jae .L_16_blocks_overflow_421 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %xmm27,%xmm0,%xmm3 + jmp .L_16_blocks_ok_421 + +.L_16_blocks_overflow_421: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %xmm29,%xmm3,%xmm3 +.L_16_blocks_ok_421: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $0,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq 
$0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%xmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vbroadcastf64x2 176(%rdi),%zmm31 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %xmm30,%xmm3,%xmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %xmm19,%xmm3,%xmm3 + vextracti32x4 $0,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %xmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %xmm29,%xmm3,%xmm19 + vextracti32x4 $0,%zmm19,%xmm7 + subq $16 * (5 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_422 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + 
vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_422 +.L_small_initial_partial_block_422: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_422: + + orq %r8,%r8 + je .L_after_reduction_422 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_422: + jmp .L_last_blocks_done_412 +.L_last_num_blocks_is_6_412: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $250,%r15d + jae .L_16_blocks_overflow_423 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %ymm27,%ymm0,%ymm3 + jmp .L_16_blocks_ok_423 + +.L_16_blocks_overflow_423: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %ymm29,%ymm3,%ymm3 +.L_16_blocks_ok_423: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $1,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + 
vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%ymm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vbroadcastf64x2 176(%rdi),%zmm31 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %ymm30,%ymm3,%ymm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %ymm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %ymm29,%ymm3,%ymm19 + vextracti32x4 $1,%zmm19,%xmm7 + subq $16 * (6 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_424 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + 
vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_424 +.L_small_initial_partial_block_424: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_424: + + orq %r8,%r8 + je .L_after_reduction_424 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_424: + jmp .L_last_blocks_done_412 +.L_last_num_blocks_is_7_412: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $249,%r15d + jae .L_16_blocks_overflow_425 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + jmp .L_16_blocks_ok_425 + +.L_16_blocks_overflow_425: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 +.L_16_blocks_ok_425: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $2,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 
0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vbroadcastf64x2 176(%rdi),%zmm31 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti32x4 $2,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vextracti32x4 $2,%zmm19,%xmm7 + subq $16 * (7 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_426 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 
+ + jmp .L_small_initial_compute_done_426 +.L_small_initial_partial_block_426: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_426: + + orq %r8,%r8 + je .L_after_reduction_426 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_426: + jmp .L_last_blocks_done_412 +.L_last_num_blocks_is_8_412: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $248,%r15d + jae .L_16_blocks_overflow_427 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + jmp .L_16_blocks_ok_427 + +.L_16_blocks_overflow_427: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 +.L_16_blocks_ok_427: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $3,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z} + vaesenc 
%zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vbroadcastf64x2 176(%rdi),%zmm31 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti32x4 $3,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vextracti32x4 $3,%zmm19,%xmm7 + subq $16 * (8 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_428 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_428 +.L_small_initial_partial_block_428: + + + + + + + + + movq 
%r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_428: + + orq %r8,%r8 + je .L_after_reduction_428 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_428: + jmp .L_last_blocks_done_412 +.L_last_num_blocks_is_9_412: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $247,%r15d + jae .L_16_blocks_overflow_429 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %xmm27,%xmm3,%xmm4 + jmp .L_16_blocks_ok_429 + +.L_16_blocks_overflow_429: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %xmm29,%xmm4,%xmm4 +.L_16_blocks_ok_429: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $0,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc 
%zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%xmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vbroadcastf64x2 176(%rdi),%zmm31 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %xmm30,%xmm4,%xmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %xmm20,%xmm4,%xmm4 + vextracti32x4 $0,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %xmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %xmm29,%xmm4,%xmm20 + vextracti32x4 $0,%zmm20,%xmm7 + subq $16 * (9 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_430 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3 + + vpxorq 
%zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_430 +.L_small_initial_partial_block_430: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_430: + + orq %r8,%r8 + je .L_after_reduction_430 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_430: + jmp .L_last_blocks_done_412 +.L_last_num_blocks_is_10_412: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $246,%r15d + jae .L_16_blocks_overflow_431 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %ymm27,%ymm3,%ymm4 + jmp .L_16_blocks_ok_431 + +.L_16_blocks_overflow_431: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %ymm29,%ymm4,%ymm4 +.L_16_blocks_ok_431: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $1,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 
48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%ymm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vbroadcastf64x2 176(%rdi),%zmm31 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %ymm30,%ymm4,%ymm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %ymm20,%ymm4,%ymm4 + vextracti32x4 $1,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %ymm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %ymm29,%ymm4,%ymm20 + vextracti32x4 
$1,%zmm20,%xmm7 + subq $16 * (10 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_432 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_432 +.L_small_initial_partial_block_432: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_432: + + orq %r8,%r8 + je .L_after_reduction_432 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_432: + jmp .L_last_blocks_done_412 +.L_last_num_blocks_is_11_412: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $245,%r15d + jae 
.L_16_blocks_overflow_433 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + jmp .L_16_blocks_ok_433 + +.L_16_blocks_overflow_433: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 +.L_16_blocks_ok_433: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $2,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vbroadcastf64x2 176(%rdi),%zmm31 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + 
vpxorq %xmm13,%xmm7,%xmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vextracti32x4 $2,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vextracti32x4 $2,%zmm20,%xmm7 + subq $16 * (11 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_434 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_434 +.L_small_initial_partial_block_434: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5 + vpclmulqdq 
$0x11,%ymm1,%ymm20,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_434: + + orq %r8,%r8 + je .L_after_reduction_434 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_434: + jmp .L_last_blocks_done_412 +.L_last_num_blocks_is_12_412: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $244,%r15d + jae .L_16_blocks_overflow_435 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + jmp .L_16_blocks_ok_435 + +.L_16_blocks_overflow_435: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 +.L_16_blocks_ok_435: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $3,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq 
$0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vbroadcastf64x2 176(%rdi),%zmm31 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vextracti32x4 $3,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vextracti32x4 $3,%zmm20,%xmm7 + subq $16 * (12 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_436 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 160(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 
+ vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_436 +.L_small_initial_partial_block_436: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_436: + + orq %r8,%r8 + je .L_after_reduction_436 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_436: + jmp .L_last_blocks_done_412 +.L_last_num_blocks_is_13_412: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $243,%r15d + jae .L_16_blocks_overflow_437 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %xmm27,%xmm4,%xmm5 + jmp .L_16_blocks_ok_437 + +.L_16_blocks_overflow_437: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %xmm29,%xmm5,%xmm5 +.L_16_blocks_ok_437: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $0,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + 
+ + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%xmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vbroadcastf64x2 176(%rdi),%zmm31 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + 
vaesenclast %xmm30,%xmm5,%xmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %xmm21,%xmm5,%xmm5 + vextracti32x4 $0,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %xmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vpshufb %xmm29,%xmm5,%xmm21 + vextracti32x4 $0,%zmm21,%xmm7 + subq $16 * (13 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_438 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 144(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_438 +.L_small_initial_partial_block_438: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 160(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 
$1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_438: + + orq %r8,%r8 + je .L_after_reduction_438 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_438: + jmp .L_last_blocks_done_412 +.L_last_num_blocks_is_14_412: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $242,%r15d + jae .L_16_blocks_overflow_439 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %ymm27,%ymm4,%ymm5 + jmp .L_16_blocks_ok_439 + +.L_16_blocks_overflow_439: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %ymm29,%ymm5,%ymm5 +.L_16_blocks_ok_439: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $1,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%ymm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 128(%rdi),%zmm30 + 
vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vbroadcastf64x2 176(%rdi),%zmm31 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %ymm30,%ymm5,%ymm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %ymm21,%ymm5,%ymm5 + vextracti32x4 $1,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %ymm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vpshufb %ymm29,%ymm5,%ymm21 + vextracti32x4 $1,%zmm21,%xmm7 + subq $16 * (14 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_440 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 128(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + 
vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_440 +.L_small_initial_partial_block_440: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 144(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_440: + + orq %r8,%r8 + je .L_after_reduction_440 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_440: + jmp .L_last_blocks_done_412 +.L_last_num_blocks_is_15_412: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $241,%r15d + jae .L_16_blocks_overflow_441 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_441 + +.L_16_blocks_overflow_441: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_441: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $2,%zmm5,%xmm2 + vshufi64x2 
$0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vbroadcastf64x2 176(%rdi),%zmm31 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc 
%zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + vextracti32x4 $2,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vpshufb %zmm29,%zmm5,%zmm21 + vextracti32x4 $2,%zmm21,%xmm7 + subq $16 * (15 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_442 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 112(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_442 +.L_small_initial_partial_block_442: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 128(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 
256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_442: + + orq %r8,%r8 + je .L_after_reduction_442 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_442: + jmp .L_last_blocks_done_412 +.L_last_num_blocks_is_16_412: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $240,%r15d + jae .L_16_blocks_overflow_443 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_443 + +.L_16_blocks_overflow_443: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_443: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $3,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + 
vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vbroadcastf64x2 176(%rdi),%zmm31 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + vextracti32x4 $3,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vpshufb %zmm29,%zmm5,%zmm21 + vextracti32x4 $3,%zmm21,%xmm7 + subq $16 * (16 - 1),%r8 +.L_small_initial_partial_block_444: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq 
%zmm14,%zmm17,%zmm17 + vmovdqu64 112(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_444: + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_444: + jmp .L_last_blocks_done_412 +.L_last_num_blocks_is_0_412: + vmovdqa64 1280(%rsp),%zmm13 + vmovdqu64 512(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 1344(%rsp),%zmm13 + vmovdqu64 576(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + vpternlogq $0x96,%zmm10,%zmm4,%zmm26 + vpternlogq $0x96,%zmm6,%zmm0,%zmm24 + vpternlogq $0x96,%zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + vmovdqa64 1408(%rsp),%zmm13 + vmovdqu64 640(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 1472(%rsp),%zmm13 + vmovdqu64 704(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + + vpternlogq $0x96,%zmm10,%zmm4,%zmm26 + vpternlogq $0x96,%zmm6,%zmm0,%zmm24 + vpternlogq $0x96,%zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + + vpsrldq $8,%zmm26,%zmm0 + vpslldq $8,%zmm26,%zmm3 + vpxorq %zmm0,%zmm24,%zmm24 + vpxorq %zmm3,%zmm25,%zmm25 + vextracti64x4 $1,%zmm24,%ymm0 + vpxorq %ymm0,%ymm24,%ymm24 + vextracti32x4 $1,%ymm24,%xmm0 + vpxorq %xmm0,%xmm24,%xmm24 + vextracti64x4 $1,%zmm25,%ymm3 + vpxorq %ymm3,%ymm25,%ymm25 + vextracti32x4 $1,%ymm25,%xmm3 + vpxorq %xmm3,%xmm25,%xmm25 + vmovdqa64 POLY2(%rip),%xmm4 + + + vpclmulqdq $0x01,%xmm25,%xmm4,%xmm0 + vpslldq $8,%xmm0,%xmm0 + vpxorq %xmm0,%xmm25,%xmm0 + + + vpclmulqdq 
$0x00,%xmm0,%xmm4,%xmm3 + vpsrldq $4,%xmm3,%xmm3 + vpclmulqdq $0x10,%xmm0,%xmm4,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm24,%xmm3,%xmm14 + +.L_last_blocks_done_412: + vpshufb %xmm29,%xmm2,%xmm2 + jmp .L_ghash_done_334 + +.L_message_below_32_blocks_334: + + + subq $256,%r8 + addq $256,%r11 + movl %r8d,%r10d + testq %r14,%r14 + jnz .L_skip_hkeys_precomputation_445 + vmovdqu64 640(%rsp),%zmm3 + + + vshufi64x2 $0x00,%zmm3,%zmm3,%zmm3 + + vmovdqu64 576(%rsp),%zmm4 + vmovdqu64 512(%rsp),%zmm5 + + vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4 + vpxorq %zmm10,%zmm4,%zmm4 + + vpsrldq $8,%zmm4,%zmm10 + vpslldq $8,%zmm4,%zmm4 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4 + vpslldq $4,%zmm4,%zmm4 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm4 + + vmovdqu64 %zmm4,448(%rsp) + + vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5 + vpxorq %zmm10,%zmm5,%zmm5 + + vpsrldq $8,%zmm5,%zmm10 + vpslldq $8,%zmm5,%zmm5 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5 + vpslldq $4,%zmm5,%zmm5 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm5 + + vmovdqu64 %zmm5,384(%rsp) + + vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4 + vpxorq %zmm10,%zmm4,%zmm4 + + vpsrldq $8,%zmm4,%zmm10 + vpslldq $8,%zmm4,%zmm4 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4 + vpslldq $4,%zmm4,%zmm4 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm4 + + vmovdqu64 %zmm4,320(%rsp) + + vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5 + vpxorq %zmm10,%zmm5,%zmm5 + + vpsrldq $8,%zmm5,%zmm10 + vpslldq $8,%zmm5,%zmm5 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5 + vpslldq $4,%zmm5,%zmm5 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm5 + + vmovdqu64 %zmm5,256(%rsp) +.L_skip_hkeys_precomputation_445: + movq $1,%r14 + andl $~15,%r10d + movl $512,%ebx + subl %r10d,%ebx + movl %r8d,%r10d + addl $15,%r10d + shrl $4,%r10d + je .L_last_num_blocks_is_0_446 + + cmpl $8,%r10d + je .L_last_num_blocks_is_8_446 + jb .L_last_num_blocks_is_7_1_446 + + + cmpl $12,%r10d + je .L_last_num_blocks_is_12_446 + jb .L_last_num_blocks_is_11_9_446 + + + cmpl $15,%r10d + je .L_last_num_blocks_is_15_446 + ja .L_last_num_blocks_is_16_446 + cmpl $14,%r10d + je .L_last_num_blocks_is_14_446 + jmp .L_last_num_blocks_is_13_446 + +.L_last_num_blocks_is_11_9_446: + + 
cmpl $10,%r10d + je .L_last_num_blocks_is_10_446 + ja .L_last_num_blocks_is_11_446 + jmp .L_last_num_blocks_is_9_446 + +.L_last_num_blocks_is_7_1_446: + cmpl $4,%r10d + je .L_last_num_blocks_is_4_446 + jb .L_last_num_blocks_is_3_1_446 + + cmpl $6,%r10d + ja .L_last_num_blocks_is_7_446 + je .L_last_num_blocks_is_6_446 + jmp .L_last_num_blocks_is_5_446 + +.L_last_num_blocks_is_3_1_446: + + cmpl $2,%r10d + ja .L_last_num_blocks_is_3_446 + je .L_last_num_blocks_is_2_446 +.L_last_num_blocks_is_1_446: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $255,%r15d + jae .L_16_blocks_overflow_447 + vpaddd %xmm28,%xmm2,%xmm0 + jmp .L_16_blocks_ok_447 + +.L_16_blocks_overflow_447: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %xmm29,%xmm0,%xmm0 +.L_16_blocks_ok_447: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%xmm17{%k1}{z} + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %xmm31,%xmm0,%xmm0 + vaesenclast %xmm30,%xmm0,%xmm0 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti32x4 $0,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %xmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %xmm29,%xmm0,%xmm17 + vextracti32x4 $0,%zmm17,%xmm7 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_448 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0 + 
vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_448 +.L_small_initial_partial_block_448: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + + + vpsrldq $8,%zmm26,%zmm0 + vpslldq $8,%zmm26,%zmm3 + vpxorq %zmm0,%zmm24,%zmm24 + vpxorq %zmm3,%zmm25,%zmm25 + vextracti64x4 $1,%zmm24,%ymm0 + vpxorq %ymm0,%ymm24,%ymm24 + vextracti32x4 $1,%ymm24,%xmm0 + vpxorq %xmm0,%xmm24,%xmm24 + vextracti64x4 $1,%zmm25,%ymm3 + vpxorq %ymm3,%ymm25,%ymm25 + vextracti32x4 $1,%ymm25,%xmm3 + vpxorq %xmm3,%xmm25,%xmm25 + vmovdqa64 POLY2(%rip),%xmm0 + + + vpclmulqdq $0x01,%xmm25,%xmm0,%xmm3 + vpslldq $8,%xmm3,%xmm3 + vpxorq %xmm3,%xmm25,%xmm3 + + + vpclmulqdq $0x00,%xmm3,%xmm0,%xmm4 + vpsrldq $4,%xmm4,%xmm4 + vpclmulqdq $0x10,%xmm3,%xmm0,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm24,%xmm4,%xmm14 + + + + + + + + + + + + + vpxorq %xmm7,%xmm14,%xmm14 + + jmp .L_after_reduction_448 +.L_small_initial_compute_done_448: +.L_after_reduction_448: + jmp .L_last_blocks_done_446 +.L_last_num_blocks_is_2_446: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $254,%r15d + jae .L_16_blocks_overflow_449 + vpaddd %ymm28,%ymm2,%ymm0 + jmp .L_16_blocks_ok_449 + +.L_16_blocks_overflow_449: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %ymm29,%ymm0,%ymm0 +.L_16_blocks_ok_449: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%ymm17{%k1}{z} + vaesenc %ymm31,%ymm0,%ymm0 + 
vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %ymm31,%ymm0,%ymm0 + vaesenclast %ymm30,%ymm0,%ymm0 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %ymm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %ymm29,%ymm0,%ymm17 + vextracti32x4 $1,%zmm17,%xmm7 + subq $16 * (2 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_450 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_450 +.L_small_initial_partial_block_450: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_450: + + orq %r8,%r8 + je .L_after_reduction_450 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_450: + jmp .L_last_blocks_done_446 +.L_last_num_blocks_is_3_446: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $253,%r15d + jae .L_16_blocks_overflow_451 + vpaddd %zmm28,%zmm2,%zmm0 + jmp 
.L_16_blocks_ok_451 + +.L_16_blocks_overflow_451: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %zmm29,%zmm0,%zmm0 +.L_16_blocks_ok_451: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm0,%zmm0 + vpxorq %zmm17,%zmm0,%zmm0 + vextracti32x4 $2,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vextracti32x4 $2,%zmm17,%xmm7 + subq $16 * (3 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_452 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq 
%xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_452 +.L_small_initial_partial_block_452: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_452: + + orq %r8,%r8 + je .L_after_reduction_452 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_452: + jmp .L_last_blocks_done_446 +.L_last_num_blocks_is_4_446: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $252,%r15d + jae .L_16_blocks_overflow_453 + vpaddd %zmm28,%zmm2,%zmm0 + jmp .L_16_blocks_ok_453 + +.L_16_blocks_overflow_453: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %zmm29,%zmm0,%zmm0 +.L_16_blocks_ok_453: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + 
vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm0,%zmm0 + vpxorq %zmm17,%zmm0,%zmm0 + vextracti32x4 $3,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vextracti32x4 $3,%zmm17,%xmm7 + subq $16 * (4 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_454 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_454 +.L_small_initial_partial_block_454: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_454: + + orq %r8,%r8 + je .L_after_reduction_454 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_454: + jmp .L_last_blocks_done_446 +.L_last_num_blocks_is_5_446: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $251,%r15d + jae .L_16_blocks_overflow_455 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %xmm27,%xmm0,%xmm3 + jmp .L_16_blocks_ok_455 + +.L_16_blocks_overflow_455: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %xmm29,%xmm3,%xmm3 
+.L_16_blocks_ok_455: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%xmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %xmm30,%xmm3,%xmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %xmm19,%xmm3,%xmm3 + vextracti32x4 $0,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %xmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %xmm29,%xmm3,%xmm19 + vextracti32x4 $0,%zmm19,%xmm7 + subq $16 * (5 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_456 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3 + + vpxorq 
%zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_456 +.L_small_initial_partial_block_456: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_456: + + orq %r8,%r8 + je .L_after_reduction_456 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_456: + jmp .L_last_blocks_done_446 +.L_last_num_blocks_is_6_446: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $250,%r15d + jae .L_16_blocks_overflow_457 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %ymm27,%ymm0,%ymm3 + jmp .L_16_blocks_ok_457 + +.L_16_blocks_overflow_457: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %ymm29,%ymm3,%ymm3 +.L_16_blocks_ok_457: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 64(%rdi),%zmm30 
+ + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%ymm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %ymm30,%ymm3,%ymm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %ymm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %ymm29,%ymm3,%ymm19 + vextracti32x4 $1,%zmm19,%xmm7 + subq $16 * (6 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_458 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_458 +.L_small_initial_partial_block_458: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + 
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_458: + + orq %r8,%r8 + je .L_after_reduction_458 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_458: + jmp .L_last_blocks_done_446 +.L_last_num_blocks_is_7_446: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $249,%r15d + jae .L_16_blocks_overflow_459 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + jmp .L_16_blocks_ok_459 + +.L_16_blocks_overflow_459: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 +.L_16_blocks_ok_459: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq 
$0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti32x4 $2,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vextracti32x4 $2,%zmm19,%xmm7 + subq $16 * (7 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_460 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_460 +.L_small_initial_partial_block_460: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + 
vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_460: + + orq %r8,%r8 + je .L_after_reduction_460 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_460: + jmp .L_last_blocks_done_446 +.L_last_num_blocks_is_8_446: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $248,%r15d + jae .L_16_blocks_overflow_461 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + jmp .L_16_blocks_ok_461 + +.L_16_blocks_overflow_461: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 +.L_16_blocks_ok_461: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 
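# Reviewer annotation (assumption-laden reading, not authoritative): the
# byte64_len_to_mask_table lookup appears to map the tail length (%r8 minus the
# 64/128/192 bytes already covered by full registers) to a 64-bit byte mask,
# loaded into %k1 for the masked vmovdqu8 loads/stores of the last partial
# chunk. The cmpl/jae pair that follows checks whether the low byte of the
# 32-bit counter would carry while producing the next N blocks: the fast path
# adds pre-shuffled increments directly, while the overflow path byte-swaps
# with the zmm29 shuffle mask, adds the ddq_add_1234/ddq_add_4444 constants,
# and swaps back.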
208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti32x4 $3,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vextracti32x4 $3,%zmm19,%xmm7 + subq $16 * (8 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_462 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_462 +.L_small_initial_partial_block_462: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_462: + + orq %r8,%r8 + je .L_after_reduction_462 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_462: + jmp .L_last_blocks_done_446 +.L_last_num_blocks_is_9_446: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + 
cmpl $247,%r15d + jae .L_16_blocks_overflow_463 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %xmm27,%xmm3,%xmm4 + jmp .L_16_blocks_ok_463 + +.L_16_blocks_overflow_463: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %xmm29,%xmm4,%xmm4 +.L_16_blocks_ok_463: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%xmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc 
%xmm31,%xmm4,%xmm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %xmm30,%xmm4,%xmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %xmm20,%xmm4,%xmm4 + vextracti32x4 $0,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %xmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %xmm29,%xmm4,%xmm20 + vextracti32x4 $0,%zmm20,%xmm7 + subq $16 * (9 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_464 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_464 +.L_small_initial_partial_block_464: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_464: + + orq %r8,%r8 + je 
.L_after_reduction_464 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_464: + jmp .L_last_blocks_done_446 +.L_last_num_blocks_is_10_446: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $246,%r15d + jae .L_16_blocks_overflow_465 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %ymm27,%ymm3,%ymm4 + jmp .L_16_blocks_ok_465 + +.L_16_blocks_overflow_465: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %ymm29,%ymm4,%ymm4 +.L_16_blocks_ok_465: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%ymm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc 
%ymm31,%ymm4,%ymm4 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %ymm30,%ymm4,%ymm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %ymm20,%ymm4,%ymm4 + vextracti32x4 $1,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %ymm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %ymm29,%ymm4,%ymm20 + vextracti32x4 $1,%zmm20,%xmm7 + subq $16 * (10 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_466 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_466 +.L_small_initial_partial_block_466: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + 
vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_466: + + orq %r8,%r8 + je .L_after_reduction_466 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_466: + jmp .L_last_blocks_done_446 +.L_last_num_blocks_is_11_446: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $245,%r15d + jae .L_16_blocks_overflow_467 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + jmp .L_16_blocks_ok_467 + +.L_16_blocks_overflow_467: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 +.L_16_blocks_ok_467: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + 
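# Reviewer annotation (hedged): the vbroadcastf64x2 0(%rdi)..224(%rdi) loads
# broadcast one 128-bit round key at a time to every lane; offsets running to
# 224 suggest a 15-entry (AES-256) key schedule, finished with vaesenclast. By
# this point in the stanza the GHASH partial products have already been queued
# into zmm24/zmm25/zmm26, so the remaining rounds are plain AES on the counter
# blocks.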
vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vextracti32x4 $2,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vextracti32x4 $2,%zmm20,%xmm7 + subq $16 * (11 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_468 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_468 +.L_small_initial_partial_block_468: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4 + vpclmulqdq 
$0x10,%ymm1,%ymm20,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_468: + + orq %r8,%r8 + je .L_after_reduction_468 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_468: + jmp .L_last_blocks_done_446 +.L_last_num_blocks_is_12_446: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $244,%r15d + jae .L_16_blocks_overflow_469 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + jmp .L_16_blocks_ok_469 + +.L_16_blocks_overflow_469: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 +.L_16_blocks_ok_469: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + 
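# Reviewer annotation (hedged): each group of four vpclmulqdq instructions
# ($0x11 / $0x00 / $0x01 / $0x10) produces the high, low, and two middle
# 64x64-bit carry-less partial products of one GHASH multiply, and
# vpternlogq $0x96 (three-input XOR) merges them into the running accumulators.
# The recurring vmovdqa64 POLY2(%rip) sequence then performs the standard
# two-fold GF(2^128) reduction of the accumulated 256-bit product back to a
# 128-bit value in %xmm14.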
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vextracti32x4 $3,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vextracti32x4 $3,%zmm20,%xmm7 + subq $16 * (12 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_470 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 160(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_470 +.L_small_initial_partial_block_470: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + 
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_470: + + orq %r8,%r8 + je .L_after_reduction_470 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_470: + jmp .L_last_blocks_done_446 +.L_last_num_blocks_is_13_446: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $243,%r15d + jae .L_16_blocks_overflow_471 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %xmm27,%xmm4,%xmm5 + jmp .L_16_blocks_ok_471 + +.L_16_blocks_overflow_471: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %xmm29,%xmm5,%xmm5 +.L_16_blocks_ok_471: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq 
$0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%xmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %xmm30,%xmm5,%xmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %xmm21,%xmm5,%xmm5 + vextracti32x4 $0,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %xmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vpshufb %xmm29,%xmm5,%xmm21 + vextracti32x4 $0,%zmm21,%xmm7 + subq $16 * (13 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_472 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 144(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq 
$0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_472 +.L_small_initial_partial_block_472: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 160(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_472: + + orq %r8,%r8 + je .L_after_reduction_472 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_472: + jmp .L_last_blocks_done_446 +.L_last_num_blocks_is_14_446: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $242,%r15d + jae .L_16_blocks_overflow_473 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %ymm27,%ymm4,%ymm5 + jmp .L_16_blocks_ok_473 + +.L_16_blocks_overflow_473: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb 
%ymm29,%ymm5,%ymm5 +.L_16_blocks_ok_473: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%ymm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc 
%zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %ymm30,%ymm5,%ymm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %ymm21,%ymm5,%ymm5 + vextracti32x4 $1,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %ymm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vpshufb %ymm29,%ymm5,%ymm21 + vextracti32x4 $1,%zmm21,%xmm7 + subq $16 * (14 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_474 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 128(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_474 +.L_small_initial_partial_block_474: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 144(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq 
$0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_474: + + orq %r8,%r8 + je .L_after_reduction_474 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_474: + jmp .L_last_blocks_done_446 +.L_last_num_blocks_is_15_446: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $241,%r15d + jae .L_16_blocks_overflow_475 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_475 + +.L_16_blocks_overflow_475: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_475: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z} + vaesenc 
%zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + vextracti32x4 $2,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vpshufb %zmm29,%zmm5,%zmm21 + vextracti32x4 $2,%zmm21,%xmm7 + subq $16 * (15 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_476 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 112(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + 
vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_476 +.L_small_initial_partial_block_476: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 128(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_476: + + orq %r8,%r8 + je .L_after_reduction_476 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_476: + jmp .L_last_blocks_done_446 +.L_last_num_blocks_is_16_446: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $240,%r15d + jae .L_16_blocks_overflow_477 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_477 + +.L_16_blocks_overflow_477: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_477: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq 
%zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + vextracti32x4 $3,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 
%zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm17 + vpshufb %zmm29,%zmm3,%zmm19 + vpshufb %zmm29,%zmm4,%zmm20 + vpshufb %zmm29,%zmm5,%zmm21 + vextracti32x4 $3,%zmm21,%xmm7 + subq $16 * (16 - 1),%r8 +.L_small_initial_partial_block_478: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 112(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_478: + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_478: + jmp .L_last_blocks_done_446 +.L_last_num_blocks_is_0_446: + vmovdqa64 768(%rsp),%zmm13 + vpxorq %zmm14,%zmm13,%zmm13 + vmovdqu64 0(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 832(%rsp),%zmm13 + vmovdqu64 64(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + vpxorq %zmm10,%zmm4,%zmm26 + vpxorq %zmm6,%zmm0,%zmm24 + vpxorq %zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + vmovdqa64 896(%rsp),%zmm13 + vmovdqu64 128(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 960(%rsp),%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + + vpternlogq $0x96,%zmm10,%zmm4,%zmm26 + vpternlogq $0x96,%zmm6,%zmm0,%zmm24 + vpternlogq $0x96,%zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + + vpsrldq $8,%zmm26,%zmm0 + vpslldq $8,%zmm26,%zmm3 + vpxorq 
%zmm0,%zmm24,%zmm24 + vpxorq %zmm3,%zmm25,%zmm25 + vextracti64x4 $1,%zmm24,%ymm0 + vpxorq %ymm0,%ymm24,%ymm24 + vextracti32x4 $1,%ymm24,%xmm0 + vpxorq %xmm0,%xmm24,%xmm24 + vextracti64x4 $1,%zmm25,%ymm3 + vpxorq %ymm3,%ymm25,%ymm25 + vextracti32x4 $1,%ymm25,%xmm3 + vpxorq %xmm3,%xmm25,%xmm25 + vmovdqa64 POLY2(%rip),%xmm4 + + + vpclmulqdq $0x01,%xmm25,%xmm4,%xmm0 + vpslldq $8,%xmm0,%xmm0 + vpxorq %xmm0,%xmm25,%xmm0 + + + vpclmulqdq $0x00,%xmm0,%xmm4,%xmm3 + vpsrldq $4,%xmm3,%xmm3 + vpclmulqdq $0x10,%xmm0,%xmm4,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm24,%xmm3,%xmm14 + +.L_last_blocks_done_446: + vpshufb %xmm29,%xmm2,%xmm2 + jmp .L_ghash_done_334 + +.L_message_below_equal_16_blocks_334: + + + movl %r8d,%r12d + addl $15,%r12d + shrl $4,%r12d + cmpq $8,%r12 + je .L_small_initial_num_blocks_is_8_479 + jl .L_small_initial_num_blocks_is_7_1_479 + + + cmpq $12,%r12 + je .L_small_initial_num_blocks_is_12_479 + jl .L_small_initial_num_blocks_is_11_9_479 + + + cmpq $16,%r12 + je .L_small_initial_num_blocks_is_16_479 + cmpq $15,%r12 + je .L_small_initial_num_blocks_is_15_479 + cmpq $14,%r12 + je .L_small_initial_num_blocks_is_14_479 + jmp .L_small_initial_num_blocks_is_13_479 + +.L_small_initial_num_blocks_is_11_9_479: + + cmpq $11,%r12 + je .L_small_initial_num_blocks_is_11_479 + cmpq $10,%r12 + je .L_small_initial_num_blocks_is_10_479 + jmp .L_small_initial_num_blocks_is_9_479 + +.L_small_initial_num_blocks_is_7_1_479: + cmpq $4,%r12 + je .L_small_initial_num_blocks_is_4_479 + jl .L_small_initial_num_blocks_is_3_1_479 + + cmpq $7,%r12 + je .L_small_initial_num_blocks_is_7_479 + cmpq $6,%r12 + je .L_small_initial_num_blocks_is_6_479 + jmp .L_small_initial_num_blocks_is_5_479 + +.L_small_initial_num_blocks_is_3_1_479: + + cmpq $3,%r12 + je .L_small_initial_num_blocks_is_3_479 + cmpq $2,%r12 + je .L_small_initial_num_blocks_is_2_479 + + + + + +.L_small_initial_num_blocks_is_1_479: + vmovdqa64 SHUF_MASK(%rip),%xmm29 + vpaddd ONE(%rip),%xmm2,%xmm0 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $0,%zmm0,%xmm2 + vpshufb %xmm29,%xmm0,%xmm0 + vmovdqu8 0(%rcx,%r11,1),%xmm6{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenc %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 176(%rdi),%zmm15 + vaesenc %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 192(%rdi),%zmm15 + vaesenc %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 208(%rdi),%zmm15 + vaesenc %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 224(%rdi),%zmm15 + vaesenclast %xmm15,%xmm0,%xmm0 + vpxorq %xmm6,%xmm0,%xmm0 + vextracti32x4 $0,%zmm0,%xmm12 + movq %r9,%r10 + vmovdqu8 %xmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %xmm29,%xmm0,%xmm6 + vextracti32x4 $0,%zmm6,%xmm13 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_480 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 336(%rsi),%xmm20 + vpclmulqdq $0x01,%xmm20,%xmm6,%xmm4 + vpclmulqdq 
$0x10,%xmm20,%xmm6,%xmm5 + vpclmulqdq $0x11,%xmm20,%xmm6,%xmm0 + vpclmulqdq $0x00,%xmm20,%xmm6,%xmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_480 +.L_small_initial_partial_block_480: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + + + + + + + + + + + + vpxorq %xmm13,%xmm14,%xmm14 + + jmp .L_after_reduction_480 +.L_small_initial_compute_done_480: +.L_after_reduction_480: + jmp .L_small_initial_blocks_encrypted_479 +.L_small_initial_num_blocks_is_2_479: + vmovdqa64 SHUF_MASK(%rip),%ymm29 + vshufi64x2 $0,%ymm2,%ymm2,%ymm0 + vpaddd ddq_add_1234(%rip),%ymm0,%ymm0 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $1,%zmm0,%xmm2 + vpshufb %ymm29,%ymm0,%ymm0 + vmovdqu8 0(%rcx,%r11,1),%ymm6{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenc %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 176(%rdi),%zmm15 + vaesenc %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 192(%rdi),%zmm15 + vaesenc %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 208(%rdi),%zmm15 + vaesenc %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 224(%rdi),%zmm15 + vaesenclast %ymm15,%ymm0,%ymm0 + vpxorq %ymm6,%ymm0,%ymm0 + vextracti32x4 $1,%zmm0,%xmm12 + movq %r9,%r10 + vmovdqu8 %ymm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %ymm29,%ymm0,%ymm6 + vextracti32x4 $1,%zmm6,%xmm13 + subq $16 * (2 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_481 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 320(%rsi),%ymm20 + vpclmulqdq $0x01,%ymm20,%ymm6,%ymm4 + vpclmulqdq $0x10,%ymm20,%ymm6,%ymm5 + vpclmulqdq $0x11,%ymm20,%ymm6,%ymm0 + vpclmulqdq $0x00,%ymm20,%ymm6,%ymm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq 
$4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_481 +.L_small_initial_partial_block_481: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 336(%rsi),%xmm20 + vpclmulqdq $0x01,%xmm20,%xmm6,%xmm4 + vpclmulqdq $0x10,%xmm20,%xmm6,%xmm5 + vpclmulqdq $0x11,%xmm20,%xmm6,%xmm0 + vpclmulqdq $0x00,%xmm20,%xmm6,%xmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_481: + + orq %r8,%r8 + je .L_after_reduction_481 + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_481: + jmp .L_small_initial_blocks_encrypted_479 +.L_small_initial_num_blocks_is_3_479: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $2,%zmm0,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vmovdqu8 0(%rcx,%r11,1),%zmm6{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 176(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 192(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 208(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 224(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vpxorq %zmm6,%zmm0,%zmm0 + vextracti32x4 $2,%zmm0,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm6 + vextracti32x4 $2,%zmm6,%xmm13 + subq $16 * (3 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_482 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 304(%rsi),%ymm20 + vinserti64x2 $2,336(%rsi),%zmm20,%zmm20 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq 
$0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_482 +.L_small_initial_partial_block_482: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 320(%rsi),%ymm20 + vpclmulqdq $0x01,%ymm20,%ymm6,%ymm4 + vpclmulqdq $0x10,%ymm20,%ymm6,%ymm5 + vpclmulqdq $0x11,%ymm20,%ymm6,%ymm0 + vpclmulqdq $0x00,%ymm20,%ymm6,%ymm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_482: + + orq %r8,%r8 + je .L_after_reduction_482 + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_482: + jmp .L_small_initial_blocks_encrypted_479 +.L_small_initial_num_blocks_is_4_479: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $3,%zmm0,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vmovdqu8 0(%rcx,%r11,1),%zmm6{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 176(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 192(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 208(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 224(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vpxorq %zmm6,%zmm0,%zmm0 + vextracti32x4 $3,%zmm0,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm6 + vextracti32x4 $3,%zmm6,%xmm13 + subq $16 * (4 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_483 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 288(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19 + + vpxorq %zmm19,%zmm17,%zmm17 + vpsrldq $8,%zmm17,%zmm4 + vpslldq $8,%zmm17,%zmm5 + vpxorq %zmm4,%zmm15,%zmm0 + vpxorq %zmm5,%zmm16,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + 
vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_483 +.L_small_initial_partial_block_483: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 304(%rsi),%ymm20 + vinserti64x2 $2,336(%rsi),%zmm20,%zmm20 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_483: + + orq %r8,%r8 + je .L_after_reduction_483 + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_483: + jmp .L_small_initial_blocks_encrypted_479 +.L_small_initial_num_blocks_is_5_479: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $64,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $0,%zmm3,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %xmm29,%xmm3,%xmm3 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%xmm7{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 176(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 192(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 208(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 224(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %xmm15,%xmm3,%xmm3 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq 
%xmm7,%xmm3,%xmm3 + vextracti32x4 $0,%zmm3,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %xmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm6 + vpshufb %xmm29,%xmm3,%xmm7 + vextracti32x4 $0,%zmm7,%xmm13 + subq $16 * (5 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_484 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 272(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19 + vmovdqu64 336(%rsi),%xmm20 + vpclmulqdq $0x01,%xmm20,%xmm7,%xmm4 + vpclmulqdq $0x10,%xmm20,%xmm7,%xmm5 + vpclmulqdq $0x11,%xmm20,%xmm7,%xmm0 + vpclmulqdq $0x00,%xmm20,%xmm7,%xmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_484 +.L_small_initial_partial_block_484: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 288(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19 + + vpxorq %zmm19,%zmm17,%zmm17 + vpsrldq $8,%zmm17,%zmm4 + vpslldq $8,%zmm17,%zmm5 + vpxorq %zmm4,%zmm15,%zmm0 + vpxorq %zmm5,%zmm16,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_484: + + orq %r8,%r8 + je .L_after_reduction_484 + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_484: + jmp .L_small_initial_blocks_encrypted_479 +.L_small_initial_num_blocks_is_6_479: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $64,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $1,%zmm3,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %ymm29,%ymm3,%ymm3 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%ymm7{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc 
%zmm15,%zmm0,%zmm0 + vaesenc %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 176(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 192(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 208(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 224(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %ymm15,%ymm3,%ymm3 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %ymm7,%ymm3,%ymm3 + vextracti32x4 $1,%zmm3,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %ymm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm6 + vpshufb %ymm29,%ymm3,%ymm7 + vextracti32x4 $1,%zmm7,%xmm13 + subq $16 * (6 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_485 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 256(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19 + vmovdqu64 320(%rsi),%ymm20 + vpclmulqdq $0x01,%ymm20,%ymm7,%ymm4 + vpclmulqdq $0x10,%ymm20,%ymm7,%ymm5 + vpclmulqdq $0x11,%ymm20,%ymm7,%ymm0 + vpclmulqdq $0x00,%ymm20,%ymm7,%ymm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_485 +.L_small_initial_partial_block_485: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 272(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19 + vmovdqu64 336(%rsi),%xmm20 + vpclmulqdq $0x01,%xmm20,%xmm7,%xmm4 + vpclmulqdq $0x10,%xmm20,%xmm7,%xmm5 + vpclmulqdq $0x11,%xmm20,%xmm7,%xmm0 + vpclmulqdq $0x00,%xmm20,%xmm7,%xmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 
+ vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_485: + + orq %r8,%r8 + je .L_after_reduction_485 + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_485: + jmp .L_small_initial_blocks_encrypted_479 +.L_small_initial_num_blocks_is_7_479: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $64,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $2,%zmm3,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%zmm7{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 176(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 192(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 208(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 224(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %zmm15,%zmm3,%zmm3 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %zmm7,%zmm3,%zmm3 + vextracti32x4 $2,%zmm3,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm6 + vpshufb %zmm29,%zmm3,%zmm7 + vextracti32x4 $2,%zmm7,%xmm13 + subq $16 * (7 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_486 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 240(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19 + vmovdqu64 304(%rsi),%ymm20 + vinserti64x2 $2,336(%rsi),%zmm20,%zmm20 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm5 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + 
vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_486 +.L_small_initial_partial_block_486: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 256(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19 + vmovdqu64 320(%rsi),%ymm20 + vpclmulqdq $0x01,%ymm20,%ymm7,%ymm4 + vpclmulqdq $0x10,%ymm20,%ymm7,%ymm5 + vpclmulqdq $0x11,%ymm20,%ymm7,%ymm0 + vpclmulqdq $0x00,%ymm20,%ymm7,%ymm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_486: + + orq %r8,%r8 + je .L_after_reduction_486 + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_486: + jmp .L_small_initial_blocks_encrypted_479 +.L_small_initial_num_blocks_is_8_479: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $64,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $3,%zmm3,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%zmm7{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 
176(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 192(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 208(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 224(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %zmm15,%zmm3,%zmm3 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %zmm7,%zmm3,%zmm3 + vextracti32x4 $3,%zmm3,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm6 + vpshufb %zmm29,%zmm3,%zmm7 + vextracti32x4 $3,%zmm7,%xmm13 + subq $16 * (8 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_487 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 224(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 288(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vpxorq %zmm15,%zmm0,%zmm15 + vpxorq %zmm16,%zmm3,%zmm16 + vpxorq %zmm17,%zmm4,%zmm17 + vpxorq %zmm19,%zmm5,%zmm19 + + vpxorq %zmm19,%zmm17,%zmm17 + vpsrldq $8,%zmm17,%zmm4 + vpslldq $8,%zmm17,%zmm5 + vpxorq %zmm4,%zmm15,%zmm0 + vpxorq %zmm5,%zmm16,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_487 +.L_small_initial_partial_block_487: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 240(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19 + vmovdqu64 304(%rsi),%ymm20 + vinserti64x2 $2,336(%rsi),%zmm20,%zmm20 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm5 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_487: + + orq %r8,%r8 + je .L_after_reduction_487 + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_487: + jmp .L_small_initial_blocks_encrypted_479 +.L_small_initial_num_blocks_is_9_479: + 
vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + vpaddd ddq_add_8888(%rip),%zmm0,%zmm4 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $128,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $0,%zmm4,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %xmm29,%xmm4,%xmm4 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%zmm7 + vmovdqu8 128(%rcx,%r11,1),%xmm10{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm15,%zmm3,%zmm3 + vpxorq %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 176(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 192(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 208(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 224(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %zmm15,%zmm3,%zmm3 + vaesenclast %xmm15,%xmm4,%xmm4 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %zmm7,%zmm3,%zmm3 + vpxorq %xmm10,%xmm4,%xmm4 + vextracti32x4 $0,%zmm4,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %xmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm6 + vpshufb %zmm29,%zmm3,%zmm7 + vpshufb %xmm29,%xmm4,%xmm10 + vextracti32x4 $0,%zmm10,%xmm13 + subq $16 * (9 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_488 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 208(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 272(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vpxorq %zmm15,%zmm0,%zmm15 + vpxorq %zmm16,%zmm3,%zmm16 + vpxorq %zmm17,%zmm4,%zmm17 + vpxorq %zmm19,%zmm5,%zmm19 + vmovdqu64 336(%rsi),%xmm20 + vpclmulqdq $0x01,%xmm20,%xmm10,%xmm4 + vpclmulqdq $0x10,%xmm20,%xmm10,%xmm5 + vpclmulqdq $0x11,%xmm20,%xmm10,%xmm0 + vpclmulqdq $0x00,%xmm20,%xmm10,%xmm3 + 
+ vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_488 +.L_small_initial_partial_block_488: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 224(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 288(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vpxorq %zmm15,%zmm0,%zmm15 + vpxorq %zmm16,%zmm3,%zmm16 + vpxorq %zmm17,%zmm4,%zmm17 + vpxorq %zmm19,%zmm5,%zmm19 + + vpxorq %zmm19,%zmm17,%zmm17 + vpsrldq $8,%zmm17,%zmm4 + vpslldq $8,%zmm17,%zmm5 + vpxorq %zmm4,%zmm15,%zmm0 + vpxorq %zmm5,%zmm16,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_488: + + orq %r8,%r8 + je .L_after_reduction_488 + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_488: + jmp .L_small_initial_blocks_encrypted_479 +.L_small_initial_num_blocks_is_10_479: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + vpaddd ddq_add_8888(%rip),%zmm0,%zmm4 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $128,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $1,%zmm4,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %ymm29,%ymm4,%ymm4 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%zmm7 + vmovdqu8 128(%rcx,%r11,1),%ymm10{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm15,%zmm3,%zmm3 + vpxorq %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc 
%ymm15,%ymm4,%ymm4 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 176(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 192(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 208(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 224(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %zmm15,%zmm3,%zmm3 + vaesenclast %ymm15,%ymm4,%ymm4 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %zmm7,%zmm3,%zmm3 + vpxorq %ymm10,%ymm4,%ymm4 + vextracti32x4 $1,%zmm4,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %ymm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm6 + vpshufb %zmm29,%zmm3,%zmm7 + vpshufb %ymm29,%ymm4,%ymm10 + vextracti32x4 $1,%zmm10,%xmm13 + subq $16 * (10 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_489 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 192(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 256(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vpxorq %zmm15,%zmm0,%zmm15 + vpxorq %zmm16,%zmm3,%zmm16 + vpxorq %zmm17,%zmm4,%zmm17 + vpxorq %zmm19,%zmm5,%zmm19 + vmovdqu64 320(%rsi),%ymm20 + vpclmulqdq $0x01,%ymm20,%ymm10,%ymm4 + vpclmulqdq $0x10,%ymm20,%ymm10,%ymm5 + vpclmulqdq $0x11,%ymm20,%ymm10,%ymm0 + vpclmulqdq $0x00,%ymm20,%ymm10,%ymm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_489 +.L_small_initial_partial_block_489: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 208(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 272(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq 
$0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vpxorq %zmm15,%zmm0,%zmm15 + vpxorq %zmm16,%zmm3,%zmm16 + vpxorq %zmm17,%zmm4,%zmm17 + vpxorq %zmm19,%zmm5,%zmm19 + vmovdqu64 336(%rsi),%xmm20 + vpclmulqdq $0x01,%xmm20,%xmm10,%xmm4 + vpclmulqdq $0x10,%xmm20,%xmm10,%xmm5 + vpclmulqdq $0x11,%xmm20,%xmm10,%xmm0 + vpclmulqdq $0x00,%xmm20,%xmm10,%xmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_489: + + orq %r8,%r8 + je .L_after_reduction_489 + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_489: + jmp .L_small_initial_blocks_encrypted_479 +.L_small_initial_num_blocks_is_11_479: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + vpaddd ddq_add_8888(%rip),%zmm0,%zmm4 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $128,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $2,%zmm4,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%zmm7 + vmovdqu8 128(%rcx,%r11,1),%zmm10{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm15,%zmm3,%zmm3 + vpxorq %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 176(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 192(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 208(%rdi),%zmm15 + vaesenc 
%zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 224(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %zmm15,%zmm3,%zmm3 + vaesenclast %zmm15,%zmm4,%zmm4 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %zmm7,%zmm3,%zmm3 + vpxorq %zmm10,%zmm4,%zmm4 + vextracti32x4 $2,%zmm4,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm6 + vpshufb %zmm29,%zmm3,%zmm7 + vpshufb %zmm29,%zmm4,%zmm10 + vextracti32x4 $2,%zmm10,%xmm13 + subq $16 * (11 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_490 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 176(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 240(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vpxorq %zmm15,%zmm0,%zmm15 + vpxorq %zmm16,%zmm3,%zmm16 + vpxorq %zmm17,%zmm4,%zmm17 + vpxorq %zmm19,%zmm5,%zmm19 + vmovdqu64 304(%rsi),%ymm20 + vinserti64x2 $2,336(%rsi),%zmm20,%zmm20 + vpclmulqdq $0x01,%zmm20,%zmm10,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm10,%zmm5 + vpclmulqdq $0x11,%zmm20,%zmm10,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm10,%zmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_490 +.L_small_initial_partial_block_490: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 192(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 256(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vpxorq %zmm15,%zmm0,%zmm15 + vpxorq %zmm16,%zmm3,%zmm16 + vpxorq %zmm17,%zmm4,%zmm17 + vpxorq %zmm19,%zmm5,%zmm19 + vmovdqu64 320(%rsi),%ymm20 + vpclmulqdq $0x01,%ymm20,%ymm10,%ymm4 + vpclmulqdq $0x10,%ymm20,%ymm10,%ymm5 + vpclmulqdq $0x11,%ymm20,%ymm10,%ymm0 + vpclmulqdq $0x00,%ymm20,%ymm10,%ymm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + 
vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_490: + + orq %r8,%r8 + je .L_after_reduction_490 + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_490: + jmp .L_small_initial_blocks_encrypted_479 +.L_small_initial_num_blocks_is_12_479: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + vpaddd ddq_add_8888(%rip),%zmm0,%zmm4 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $128,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $3,%zmm4,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%zmm7 + vmovdqu8 128(%rcx,%r11,1),%zmm10{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm15,%zmm3,%zmm3 + vpxorq %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 176(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 192(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 208(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 224(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %zmm15,%zmm3,%zmm3 + vaesenclast %zmm15,%zmm4,%zmm4 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %zmm7,%zmm3,%zmm3 + vpxorq %zmm10,%zmm4,%zmm4 + vextracti32x4 $3,%zmm4,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm6 + vpshufb %zmm29,%zmm3,%zmm7 + vpshufb %zmm29,%zmm4,%zmm10 + vextracti32x4 $3,%zmm10,%xmm13 + subq $16 * (12 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_491 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 160(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq 
$0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 224(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vmovdqu64 288(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm0,%zmm6,%zmm15 + vpternlogq $0x96,%zmm3,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm4,%zmm6,%zmm17 + vpternlogq $0x96,%zmm5,%zmm7,%zmm19 + + vpxorq %zmm19,%zmm17,%zmm17 + vpsrldq $8,%zmm17,%zmm4 + vpslldq $8,%zmm17,%zmm5 + vpxorq %zmm4,%zmm15,%zmm0 + vpxorq %zmm5,%zmm16,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_491 +.L_small_initial_partial_block_491: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 176(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 240(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vpxorq %zmm15,%zmm0,%zmm15 + vpxorq %zmm16,%zmm3,%zmm16 + vpxorq %zmm17,%zmm4,%zmm17 + vpxorq %zmm19,%zmm5,%zmm19 + vmovdqu64 304(%rsi),%ymm20 + vinserti64x2 $2,336(%rsi),%zmm20,%zmm20 + vpclmulqdq $0x01,%zmm20,%zmm10,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm10,%zmm5 + vpclmulqdq $0x11,%zmm20,%zmm10,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm10,%zmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_491: + + orq %r8,%r8 + je .L_after_reduction_491 + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_491: + jmp .L_small_initial_blocks_encrypted_479 +.L_small_initial_num_blocks_is_13_479: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + vpaddd ddq_add_8888(%rip),%zmm0,%zmm4 + vpaddd ddq_add_8888(%rip),%zmm3,%zmm5 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $192,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $0,%zmm5,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb 
%zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %xmm29,%xmm5,%xmm5 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%zmm7 + vmovdqu8 128(%rcx,%r11,1),%zmm10 + vmovdqu8 192(%rcx,%r11,1),%xmm11{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm15,%zmm3,%zmm3 + vpxorq %zmm15,%zmm4,%zmm4 + vpxorq %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 176(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 192(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 208(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 224(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %zmm15,%zmm3,%zmm3 + vaesenclast %zmm15,%zmm4,%zmm4 + vaesenclast %xmm15,%xmm5,%xmm5 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %zmm7,%zmm3,%zmm3 + vpxorq %zmm10,%zmm4,%zmm4 + vpxorq %xmm11,%xmm5,%xmm5 + vextracti32x4 $0,%zmm5,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %xmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm6 + vpshufb %zmm29,%zmm3,%zmm7 + vpshufb %zmm29,%zmm4,%zmm10 + vpshufb %xmm29,%xmm5,%xmm11 + vextracti32x4 $0,%zmm11,%xmm13 + subq $16 * (13 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_492 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 144(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 208(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vmovdqu64 272(%rsi),%zmm20 + vpclmulqdq 
$0x11,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm0,%zmm6,%zmm15 + vpternlogq $0x96,%zmm3,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm4,%zmm6,%zmm17 + vpternlogq $0x96,%zmm5,%zmm7,%zmm19 + vmovdqu64 336(%rsi),%xmm20 + vpclmulqdq $0x01,%xmm20,%xmm11,%xmm4 + vpclmulqdq $0x10,%xmm20,%xmm11,%xmm5 + vpclmulqdq $0x11,%xmm20,%xmm11,%xmm0 + vpclmulqdq $0x00,%xmm20,%xmm11,%xmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_492 +.L_small_initial_partial_block_492: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 160(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 224(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vmovdqu64 288(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm0,%zmm6,%zmm15 + vpternlogq $0x96,%zmm3,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm4,%zmm6,%zmm17 + vpternlogq $0x96,%zmm5,%zmm7,%zmm19 + + vpxorq %zmm19,%zmm17,%zmm17 + vpsrldq $8,%zmm17,%zmm4 + vpslldq $8,%zmm17,%zmm5 + vpxorq %zmm4,%zmm15,%zmm0 + vpxorq %zmm5,%zmm16,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_492: + + orq %r8,%r8 + je .L_after_reduction_492 + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_492: + jmp .L_small_initial_blocks_encrypted_479 +.L_small_initial_num_blocks_is_14_479: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + vpaddd ddq_add_8888(%rip),%zmm0,%zmm4 + vpaddd ddq_add_8888(%rip),%zmm3,%zmm5 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $192,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $1,%zmm5,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %ymm29,%ymm5,%ymm5 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + 
vmovdqu8 64(%rcx,%r11,1),%zmm7 + vmovdqu8 128(%rcx,%r11,1),%zmm10 + vmovdqu8 192(%rcx,%r11,1),%ymm11{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm15,%zmm3,%zmm3 + vpxorq %zmm15,%zmm4,%zmm4 + vpxorq %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 176(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 192(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 208(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 224(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %zmm15,%zmm3,%zmm3 + vaesenclast %zmm15,%zmm4,%zmm4 + vaesenclast %ymm15,%ymm5,%ymm5 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %zmm7,%zmm3,%zmm3 + vpxorq %zmm10,%zmm4,%zmm4 + vpxorq %ymm11,%ymm5,%ymm5 + vextracti32x4 $1,%zmm5,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %ymm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm6 + vpshufb %zmm29,%zmm3,%zmm7 + vpshufb %zmm29,%zmm4,%zmm10 + vpshufb %ymm29,%ymm5,%ymm11 + vextracti32x4 $1,%zmm11,%xmm13 + subq $16 * (14 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_493 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 128(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 192(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vmovdqu64 256(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm0,%zmm6,%zmm15 + vpternlogq 
$0x96,%zmm3,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm4,%zmm6,%zmm17 + vpternlogq $0x96,%zmm5,%zmm7,%zmm19 + vmovdqu64 320(%rsi),%ymm20 + vpclmulqdq $0x01,%ymm20,%ymm11,%ymm4 + vpclmulqdq $0x10,%ymm20,%ymm11,%ymm5 + vpclmulqdq $0x11,%ymm20,%ymm11,%ymm0 + vpclmulqdq $0x00,%ymm20,%ymm11,%ymm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_493 +.L_small_initial_partial_block_493: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 144(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 208(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vmovdqu64 272(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm0,%zmm6,%zmm15 + vpternlogq $0x96,%zmm3,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm4,%zmm6,%zmm17 + vpternlogq $0x96,%zmm5,%zmm7,%zmm19 + vmovdqu64 336(%rsi),%xmm20 + vpclmulqdq $0x01,%xmm20,%xmm11,%xmm4 + vpclmulqdq $0x10,%xmm20,%xmm11,%xmm5 + vpclmulqdq $0x11,%xmm20,%xmm11,%xmm0 + vpclmulqdq $0x00,%xmm20,%xmm11,%xmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_493: + + orq %r8,%r8 + je .L_after_reduction_493 + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_493: + jmp .L_small_initial_blocks_encrypted_479 +.L_small_initial_num_blocks_is_15_479: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + vpaddd ddq_add_8888(%rip),%zmm0,%zmm4 + vpaddd ddq_add_8888(%rip),%zmm3,%zmm5 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $192,%r15 + kmovq (%r10,%r15,8),%k1 + 
vextracti32x4 $2,%zmm5,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%zmm7 + vmovdqu8 128(%rcx,%r11,1),%zmm10 + vmovdqu8 192(%rcx,%r11,1),%zmm11{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm15,%zmm3,%zmm3 + vpxorq %zmm15,%zmm4,%zmm4 + vpxorq %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 176(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 192(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 208(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 224(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %zmm15,%zmm3,%zmm3 + vaesenclast %zmm15,%zmm4,%zmm4 + vaesenclast %zmm15,%zmm5,%zmm5 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %zmm7,%zmm3,%zmm3 + vpxorq %zmm10,%zmm4,%zmm4 + vpxorq %zmm11,%zmm5,%zmm5 + vextracti32x4 $2,%zmm5,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm6 + vpshufb %zmm29,%zmm3,%zmm7 + vpshufb %zmm29,%zmm4,%zmm10 + vpshufb %zmm29,%zmm5,%zmm11 + vextracti32x4 $2,%zmm11,%xmm13 + subq $16 * (15 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_494 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 112(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 176(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq 
$0x10,%zmm20,%zmm7,%zmm19 + vmovdqu64 240(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm0,%zmm6,%zmm15 + vpternlogq $0x96,%zmm3,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm4,%zmm6,%zmm17 + vpternlogq $0x96,%zmm5,%zmm7,%zmm19 + vmovdqu64 304(%rsi),%ymm20 + vinserti64x2 $2,336(%rsi),%zmm20,%zmm20 + vpclmulqdq $0x01,%zmm20,%zmm11,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm11,%zmm5 + vpclmulqdq $0x11,%zmm20,%zmm11,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm11,%zmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_494 +.L_small_initial_partial_block_494: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 128(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 192(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vmovdqu64 256(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm0,%zmm6,%zmm15 + vpternlogq $0x96,%zmm3,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm4,%zmm6,%zmm17 + vpternlogq $0x96,%zmm5,%zmm7,%zmm19 + vmovdqu64 320(%rsi),%ymm20 + vpclmulqdq $0x01,%ymm20,%ymm11,%ymm4 + vpclmulqdq $0x10,%ymm20,%ymm11,%ymm5 + vpclmulqdq $0x11,%ymm20,%ymm11,%ymm0 + vpclmulqdq $0x00,%ymm20,%ymm11,%ymm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_494: + + orq %r8,%r8 + je .L_after_reduction_494 + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_494: + jmp .L_small_initial_blocks_encrypted_479 +.L_small_initial_num_blocks_is_16_479: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + 
vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + vpaddd ddq_add_8888(%rip),%zmm0,%zmm4 + vpaddd ddq_add_8888(%rip),%zmm3,%zmm5 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $192,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $3,%zmm5,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%zmm7 + vmovdqu8 128(%rcx,%r11,1),%zmm10 + vmovdqu8 192(%rcx,%r11,1),%zmm11{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm15,%zmm3,%zmm3 + vpxorq %zmm15,%zmm4,%zmm4 + vpxorq %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 176(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 192(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 208(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 224(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %zmm15,%zmm3,%zmm3 + vaesenclast %zmm15,%zmm4,%zmm4 + vaesenclast %zmm15,%zmm5,%zmm5 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %zmm7,%zmm3,%zmm3 + vpxorq %zmm10,%zmm4,%zmm4 + vpxorq %zmm11,%zmm5,%zmm5 + vextracti32x4 $3,%zmm5,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm0,%zmm6 + vpshufb %zmm29,%zmm3,%zmm7 + vpshufb %zmm29,%zmm4,%zmm10 + vpshufb %zmm29,%zmm5,%zmm11 + vextracti32x4 $3,%zmm11,%xmm13 + subq $16 * (16 - 1),%r8 +.L_small_initial_partial_block_495: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 112(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + 
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 176(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vmovdqu64 240(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm0,%zmm6,%zmm15 + vpternlogq $0x96,%zmm3,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm4,%zmm6,%zmm17 + vpternlogq $0x96,%zmm5,%zmm7,%zmm19 + vmovdqu64 304(%rsi),%ymm20 + vinserti64x2 $2,336(%rsi),%zmm20,%zmm20 + vpclmulqdq $0x01,%zmm20,%zmm11,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm11,%zmm5 + vpclmulqdq $0x11,%zmm20,%zmm11,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm11,%zmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_495: + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_495: +.L_small_initial_blocks_encrypted_479: +.L_ghash_done_334: + vmovdqu64 %xmm2,0(%rsi) + vmovdqu64 %xmm14,64(%rsi) +.L_enc_dec_done_334: + jmp .Lexit_gcm_encrypt +.Lexit_gcm_encrypt: + cmpq $256,%r8 + jbe .Lskip_hkeys_cleanup_496 + vpxor %xmm0,%xmm0,%xmm0 + vmovdqa64 %zmm0,0(%rsp) + vmovdqa64 %zmm0,64(%rsp) + vmovdqa64 %zmm0,128(%rsp) + vmovdqa64 %zmm0,192(%rsp) + vmovdqa64 %zmm0,256(%rsp) + vmovdqa64 %zmm0,320(%rsp) + vmovdqa64 %zmm0,384(%rsp) + vmovdqa64 %zmm0,448(%rsp) + vmovdqa64 %zmm0,512(%rsp) + vmovdqa64 %zmm0,576(%rsp) + vmovdqa64 %zmm0,640(%rsp) + vmovdqa64 %zmm0,704(%rsp) +.Lskip_hkeys_cleanup_496: + vzeroupper + leaq (%rbp),%rsp +.cfi_def_cfa_register %rsp + popq %r15 +.cfi_adjust_cfa_offset -8 +.cfi_restore %r15 + popq %r14 +.cfi_adjust_cfa_offset -8 +.cfi_restore %r14 + popq %r13 +.cfi_adjust_cfa_offset -8 +.cfi_restore %r13 + popq %r12 +.cfi_adjust_cfa_offset -8 +.cfi_restore %r12 + popq %rbp +.cfi_adjust_cfa_offset -8 +.cfi_restore %rbp + popq %rbx +.cfi_adjust_cfa_offset -8 +.cfi_restore %rbx + .byte 0xf3,0xc3 +.Lencrypt_seh_end: +.cfi_endproc +.size ossl_aes_gcm_encrypt_avx512, .-ossl_aes_gcm_encrypt_avx512 +.globl ossl_aes_gcm_decrypt_avx512 +.type ossl_aes_gcm_decrypt_avx512,@function +.align 32 +ossl_aes_gcm_decrypt_avx512: +.cfi_startproc +.Ldecrypt_seh_begin: +.byte 243,15,30,250 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-16 +.Ldecrypt_seh_push_rbx: + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-24 +.Ldecrypt_seh_push_rbp: + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 +.Ldecrypt_seh_push_r12: + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 +.Ldecrypt_seh_push_r13: + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 +.Ldecrypt_seh_push_r14: + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 +.Ldecrypt_seh_push_r15: + + + + + + + + + + + leaq 0(%rsp),%rbp 
+.cfi_def_cfa_register %rbp +.Ldecrypt_seh_setfp: + +.Ldecrypt_seh_prolog_end: + subq $1588,%rsp + andq $(-64),%rsp + + + movl 240(%rdi),%eax + cmpl $9,%eax + je .Laes_gcm_decrypt_128_avx512 + cmpl $11,%eax + je .Laes_gcm_decrypt_192_avx512 + cmpl $13,%eax + je .Laes_gcm_decrypt_256_avx512 + xorl %eax,%eax + jmp .Lexit_gcm_decrypt +.align 32 +.Laes_gcm_decrypt_128_avx512: + orq %r8,%r8 + je .L_enc_dec_done_497 + xorq %r14,%r14 + vmovdqu64 64(%rsi),%xmm14 + + movq (%rdx),%r11 + orq %r11,%r11 + je .L_partial_block_done_498 + movl $16,%r10d + leaq byte_len_to_mask_table(%rip),%r12 + cmpq %r10,%r8 + cmovcq %r8,%r10 + kmovw (%r12,%r10,2),%k1 + vmovdqu8 (%rcx),%xmm0{%k1}{z} + + vmovdqu64 16(%rsi),%xmm3 + vmovdqu64 336(%rsi),%xmm4 + + + + leaq SHIFT_MASK(%rip),%r12 + addq %r11,%r12 + vmovdqu64 (%r12),%xmm5 + vpshufb %xmm5,%xmm3,%xmm3 + + vmovdqa64 %xmm0,%xmm6 + vpxorq %xmm0,%xmm3,%xmm3 + + + leaq (%r8,%r11,1),%r13 + subq $16,%r13 + jge .L_no_extra_mask_498 + subq %r13,%r12 +.L_no_extra_mask_498: + + + + vmovdqu64 16(%r12),%xmm0 + vpand %xmm0,%xmm3,%xmm3 + vpand %xmm0,%xmm6,%xmm6 + vpshufb SHUF_MASK(%rip),%xmm6,%xmm6 + vpshufb %xmm5,%xmm6,%xmm6 + vpxorq %xmm6,%xmm14,%xmm14 + cmpq $0,%r13 + jl .L_partial_incomplete_498 + + vpclmulqdq $0x11,%xmm4,%xmm14,%xmm7 + vpclmulqdq $0x00,%xmm4,%xmm14,%xmm10 + vpclmulqdq $0x01,%xmm4,%xmm14,%xmm11 + vpclmulqdq $0x10,%xmm4,%xmm14,%xmm14 + vpxorq %xmm11,%xmm14,%xmm14 + + vpsrldq $8,%xmm14,%xmm11 + vpslldq $8,%xmm14,%xmm14 + vpxorq %xmm11,%xmm7,%xmm7 + vpxorq %xmm10,%xmm14,%xmm14 + + + + vmovdqu64 POLY2(%rip),%xmm11 + + vpclmulqdq $0x01,%xmm14,%xmm11,%xmm10 + vpslldq $8,%xmm10,%xmm10 + vpxorq %xmm10,%xmm14,%xmm14 + + + + vpclmulqdq $0x00,%xmm14,%xmm11,%xmm10 + vpsrldq $4,%xmm10,%xmm10 + vpclmulqdq $0x10,%xmm14,%xmm11,%xmm14 + vpslldq $4,%xmm14,%xmm14 + + vpternlogq $0x96,%xmm10,%xmm7,%xmm14 + + movq $0,(%rdx) + + movq %r11,%r12 + movq $16,%r11 + subq %r12,%r11 + jmp .L_enc_dec_done_498 + +.L_partial_incomplete_498: + addq %r8,(%rdx) + movq %r8,%r11 + +.L_enc_dec_done_498: + + + leaq byte_len_to_mask_table(%rip),%r12 + kmovw (%r12,%r11,2),%k1 + vmovdqu64 %xmm14,64(%rsi) + movq %r9,%r12 + vmovdqu8 %xmm3,(%r12){%k1} +.L_partial_block_done_498: + vmovdqu64 0(%rsi),%xmm2 + subq %r11,%r8 + je .L_enc_dec_done_497 + cmpq $256,%r8 + jbe .L_message_below_equal_16_blocks_497 + + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vmovdqa64 ddq_addbe_4444(%rip),%zmm27 + vmovdqa64 ddq_addbe_1234(%rip),%zmm28 + + + + + + + vmovd %xmm2,%r15d + andl $255,%r15d + + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpshufb %zmm29,%zmm2,%zmm2 + + + + cmpb $240,%r15b + jae .L_next_16_overflow_499 + vpaddd %zmm28,%zmm2,%zmm7 + vpaddd %zmm27,%zmm7,%zmm10 + vpaddd %zmm27,%zmm10,%zmm11 + vpaddd %zmm27,%zmm11,%zmm12 + jmp .L_next_16_ok_499 +.L_next_16_overflow_499: + vpshufb %zmm29,%zmm2,%zmm2 + vmovdqa64 ddq_add_4444(%rip),%zmm12 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm7 + vpaddd %zmm12,%zmm7,%zmm10 + vpaddd %zmm12,%zmm10,%zmm11 + vpaddd %zmm12,%zmm11,%zmm12 + vpshufb %zmm29,%zmm7,%zmm7 + vpshufb %zmm29,%zmm10,%zmm10 + vpshufb %zmm29,%zmm11,%zmm11 + vpshufb %zmm29,%zmm12,%zmm12 +.L_next_16_ok_499: + vshufi64x2 $255,%zmm12,%zmm12,%zmm2 + addb $16,%r15b + + vmovdqu8 0(%rcx,%r11,1),%zmm0 + vmovdqu8 64(%rcx,%r11,1),%zmm3 + vmovdqu8 128(%rcx,%r11,1),%zmm4 + vmovdqu8 192(%rcx,%r11,1),%zmm5 + + + vbroadcastf64x2 0(%rdi),%zmm6 + vpxorq %zmm6,%zmm7,%zmm7 + vpxorq %zmm6,%zmm10,%zmm10 + vpxorq %zmm6,%zmm11,%zmm11 + vpxorq %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 16(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + 
vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 32(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 48(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 64(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 80(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 96(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 112(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 128(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 144(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 160(%rdi),%zmm6 + vaesenclast %zmm6,%zmm7,%zmm7 + vaesenclast %zmm6,%zmm10,%zmm10 + vaesenclast %zmm6,%zmm11,%zmm11 + vaesenclast %zmm6,%zmm12,%zmm12 + + + vpxorq %zmm0,%zmm7,%zmm7 + vpxorq %zmm3,%zmm10,%zmm10 + vpxorq %zmm4,%zmm11,%zmm11 + vpxorq %zmm5,%zmm12,%zmm12 + + + movq %r9,%r10 + vmovdqu8 %zmm7,0(%r10,%r11,1) + vmovdqu8 %zmm10,64(%r10,%r11,1) + vmovdqu8 %zmm11,128(%r10,%r11,1) + vmovdqu8 %zmm12,192(%r10,%r11,1) + + vpshufb %zmm29,%zmm0,%zmm7 + vpshufb %zmm29,%zmm3,%zmm10 + vpshufb %zmm29,%zmm4,%zmm11 + vpshufb %zmm29,%zmm5,%zmm12 + vmovdqa64 %zmm7,768(%rsp) + vmovdqa64 %zmm10,832(%rsp) + vmovdqa64 %zmm11,896(%rsp) + vmovdqa64 %zmm12,960(%rsp) + testq %r14,%r14 + jnz .L_skip_hkeys_precomputation_500 + + vmovdqu64 288(%rsi),%zmm0 + vmovdqu64 %zmm0,704(%rsp) + + vmovdqu64 224(%rsi),%zmm3 + vmovdqu64 %zmm3,640(%rsp) + + + vshufi64x2 $0x00,%zmm3,%zmm3,%zmm3 + + vmovdqu64 160(%rsi),%zmm4 + vmovdqu64 %zmm4,576(%rsp) + + vmovdqu64 96(%rsi),%zmm5 + vmovdqu64 %zmm5,512(%rsp) +.L_skip_hkeys_precomputation_500: + cmpq $512,%r8 + jb .L_message_below_32_blocks_497 + + + + cmpb $240,%r15b + jae .L_next_16_overflow_501 + vpaddd %zmm28,%zmm2,%zmm7 + vpaddd %zmm27,%zmm7,%zmm10 + vpaddd %zmm27,%zmm10,%zmm11 + vpaddd %zmm27,%zmm11,%zmm12 + jmp .L_next_16_ok_501 +.L_next_16_overflow_501: + vpshufb %zmm29,%zmm2,%zmm2 + vmovdqa64 ddq_add_4444(%rip),%zmm12 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm7 + vpaddd %zmm12,%zmm7,%zmm10 + vpaddd %zmm12,%zmm10,%zmm11 + vpaddd %zmm12,%zmm11,%zmm12 + vpshufb %zmm29,%zmm7,%zmm7 + vpshufb %zmm29,%zmm10,%zmm10 + vpshufb %zmm29,%zmm11,%zmm11 + vpshufb %zmm29,%zmm12,%zmm12 +.L_next_16_ok_501: + vshufi64x2 $255,%zmm12,%zmm12,%zmm2 + addb $16,%r15b + + vmovdqu8 256(%rcx,%r11,1),%zmm0 + vmovdqu8 320(%rcx,%r11,1),%zmm3 + vmovdqu8 384(%rcx,%r11,1),%zmm4 + vmovdqu8 448(%rcx,%r11,1),%zmm5 + + + vbroadcastf64x2 0(%rdi),%zmm6 + vpxorq %zmm6,%zmm7,%zmm7 + vpxorq %zmm6,%zmm10,%zmm10 + vpxorq %zmm6,%zmm11,%zmm11 + vpxorq %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 16(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 32(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc 
%zmm6,%zmm12,%zmm12 + vbroadcastf64x2 48(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 64(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 80(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 96(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 112(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 128(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 144(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 160(%rdi),%zmm6 + vaesenclast %zmm6,%zmm7,%zmm7 + vaesenclast %zmm6,%zmm10,%zmm10 + vaesenclast %zmm6,%zmm11,%zmm11 + vaesenclast %zmm6,%zmm12,%zmm12 + + + vpxorq %zmm0,%zmm7,%zmm7 + vpxorq %zmm3,%zmm10,%zmm10 + vpxorq %zmm4,%zmm11,%zmm11 + vpxorq %zmm5,%zmm12,%zmm12 + + + movq %r9,%r10 + vmovdqu8 %zmm7,256(%r10,%r11,1) + vmovdqu8 %zmm10,320(%r10,%r11,1) + vmovdqu8 %zmm11,384(%r10,%r11,1) + vmovdqu8 %zmm12,448(%r10,%r11,1) + + vpshufb %zmm29,%zmm0,%zmm7 + vpshufb %zmm29,%zmm3,%zmm10 + vpshufb %zmm29,%zmm4,%zmm11 + vpshufb %zmm29,%zmm5,%zmm12 + vmovdqa64 %zmm7,1024(%rsp) + vmovdqa64 %zmm10,1088(%rsp) + vmovdqa64 %zmm11,1152(%rsp) + vmovdqa64 %zmm12,1216(%rsp) + testq %r14,%r14 + jnz .L_skip_hkeys_precomputation_502 + vmovdqu64 640(%rsp),%zmm3 + + + vshufi64x2 $0x00,%zmm3,%zmm3,%zmm3 + + vmovdqu64 576(%rsp),%zmm4 + vmovdqu64 512(%rsp),%zmm5 + + vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4 + vpxorq %zmm10,%zmm4,%zmm4 + + vpsrldq $8,%zmm4,%zmm10 + vpslldq $8,%zmm4,%zmm4 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4 + vpslldq $4,%zmm4,%zmm4 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm4 + + vmovdqu64 %zmm4,448(%rsp) + + vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5 + vpxorq %zmm10,%zmm5,%zmm5 + + vpsrldq $8,%zmm5,%zmm10 + vpslldq $8,%zmm5,%zmm5 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5 + vpslldq $4,%zmm5,%zmm5 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm5 + + vmovdqu64 %zmm5,384(%rsp) + + vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4 + vpxorq %zmm10,%zmm4,%zmm4 + + vpsrldq $8,%zmm4,%zmm10 + vpslldq $8,%zmm4,%zmm4 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7 + 
vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4 + vpslldq $4,%zmm4,%zmm4 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm4 + + vmovdqu64 %zmm4,320(%rsp) + + vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5 + vpxorq %zmm10,%zmm5,%zmm5 + + vpsrldq $8,%zmm5,%zmm10 + vpslldq $8,%zmm5,%zmm5 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5 + vpslldq $4,%zmm5,%zmm5 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm5 + + vmovdqu64 %zmm5,256(%rsp) + + vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4 + vpxorq %zmm10,%zmm4,%zmm4 + + vpsrldq $8,%zmm4,%zmm10 + vpslldq $8,%zmm4,%zmm4 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4 + vpslldq $4,%zmm4,%zmm4 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm4 + + vmovdqu64 %zmm4,192(%rsp) + + vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5 + vpxorq %zmm10,%zmm5,%zmm5 + + vpsrldq $8,%zmm5,%zmm10 + vpslldq $8,%zmm5,%zmm5 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5 + vpslldq $4,%zmm5,%zmm5 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm5 + + vmovdqu64 %zmm5,128(%rsp) + + vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4 + vpxorq %zmm10,%zmm4,%zmm4 + + vpsrldq $8,%zmm4,%zmm10 + vpslldq $8,%zmm4,%zmm4 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4 + vpslldq $4,%zmm4,%zmm4 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm4 + + vmovdqu64 %zmm4,64(%rsp) + + vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5 + vpxorq %zmm10,%zmm5,%zmm5 + + vpsrldq $8,%zmm5,%zmm10 + vpslldq $8,%zmm5,%zmm5 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5 + vpslldq $4,%zmm5,%zmm5 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm5 + + vmovdqu64 %zmm5,0(%rsp) +.L_skip_hkeys_precomputation_502: + movq $1,%r14 + addq $512,%r11 + subq $512,%r8 + + cmpq $768,%r8 + jb .L_no_more_big_nblocks_497 +.L_encrypt_big_nblocks_497: + cmpb $240,%r15b + jae .L_16_blocks_overflow_503 + vpaddd 
%zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_503 +.L_16_blocks_overflow_503: + vpshufb %zmm29,%zmm2,%zmm2 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_503: + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp),%zmm1 + + + + + vshufi64x2 $255,%zmm5,%zmm5,%zmm2 + addb $16,%r15b + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + + + + + + + + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm6 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + + + + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%zmm21 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm15,%zmm10,%zmm26 + vpxorq %zmm12,%zmm6,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast 
%zmm30,%zmm5,%zmm5 + + + + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + + + + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1) + vpshufb %zmm29,%zmm17,%zmm0 + vpshufb %zmm29,%zmm19,%zmm3 + vpshufb %zmm29,%zmm20,%zmm4 + vpshufb %zmm29,%zmm21,%zmm5 + vmovdqa64 %zmm0,1280(%rsp) + vmovdqa64 %zmm3,1344(%rsp) + vmovdqa64 %zmm4,1408(%rsp) + vmovdqa64 %zmm5,1472(%rsp) + cmpb $240,%r15b + jae .L_16_blocks_overflow_504 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_504 +.L_16_blocks_overflow_504: + vpshufb %zmm29,%zmm2,%zmm2 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_504: + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 256(%rsp),%zmm1 + + + + + vshufi64x2 $255,%zmm5,%zmm5,%zmm2 + addb $16,%r15b + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 320(%rsp),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + + + + + + + + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 384(%rsp),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 448(%rsp),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm6 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + + + + vmovdqu8 256(%rcx,%r11,1),%zmm17 + vmovdqu8 320(%rcx,%r11,1),%zmm19 + vmovdqu8 384(%rcx,%r11,1),%zmm20 + vmovdqu8 448(%rcx,%r11,1),%zmm21 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq 
$0x96,%zmm15,%zmm10,%zmm26 + vpternlogq $0x96,%zmm12,%zmm6,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + + + + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + + + + movq %r9,%r10 + vmovdqu8 %zmm0,256(%r10,%r11,1) + vmovdqu8 %zmm3,320(%r10,%r11,1) + vmovdqu8 %zmm4,384(%r10,%r11,1) + vmovdqu8 %zmm5,448(%r10,%r11,1) + vpshufb %zmm29,%zmm17,%zmm0 + vpshufb %zmm29,%zmm19,%zmm3 + vpshufb %zmm29,%zmm20,%zmm4 + vpshufb %zmm29,%zmm21,%zmm5 + vmovdqa64 %zmm0,768(%rsp) + vmovdqa64 %zmm3,832(%rsp) + vmovdqa64 %zmm4,896(%rsp) + vmovdqa64 %zmm5,960(%rsp) + cmpb $240,%r15b + jae .L_16_blocks_overflow_505 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_505 +.L_16_blocks_overflow_505: + vpshufb %zmm29,%zmm2,%zmm2 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_505: + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + + + + + vshufi64x2 $255,%zmm5,%zmm5,%zmm2 + addb $16,%r15b + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + + + + + + + + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm6 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + + + + vmovdqu8 512(%rcx,%r11,1),%zmm17 + vmovdqu8 
576(%rcx,%r11,1),%zmm19 + vmovdqu8 640(%rcx,%r11,1),%zmm20 + vmovdqu8 704(%rcx,%r11,1),%zmm21 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + + + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vpternlogq $0x96,%zmm15,%zmm12,%zmm6 + vpxorq %zmm24,%zmm6,%zmm6 + vpternlogq $0x96,%zmm10,%zmm13,%zmm7 + vpxorq %zmm25,%zmm7,%zmm7 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vextracti64x4 $1,%zmm6,%ymm12 + vpxorq %ymm12,%ymm6,%ymm6 + vextracti32x4 $1,%ymm6,%xmm12 + vpxorq %xmm12,%xmm6,%xmm6 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm6 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + + + + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + + + + movq %r9,%r10 + vmovdqu8 %zmm0,512(%r10,%r11,1) + vmovdqu8 %zmm3,576(%r10,%r11,1) + vmovdqu8 %zmm4,640(%r10,%r11,1) + vmovdqu8 %zmm5,704(%r10,%r11,1) + vpshufb %zmm29,%zmm17,%zmm0 + vpshufb %zmm29,%zmm19,%zmm3 + vpshufb %zmm29,%zmm20,%zmm4 + vpshufb %zmm29,%zmm21,%zmm5 + vmovdqa64 %zmm0,1024(%rsp) + vmovdqa64 %zmm3,1088(%rsp) + vmovdqa64 %zmm4,1152(%rsp) + vmovdqa64 %zmm5,1216(%rsp) + vmovdqa64 %zmm6,%zmm14 + + addq $768,%r11 + subq $768,%r8 + cmpq $768,%r8 + jae .L_encrypt_big_nblocks_497 + +.L_no_more_big_nblocks_497: + + cmpq $512,%r8 + jae .L_encrypt_32_blocks_497 + + cmpq $256,%r8 + jae .L_encrypt_16_blocks_497 +.L_encrypt_0_blocks_ghash_32_497: + movl %r8d,%r10d + andl $~15,%r10d + movl $256,%ebx + subl %r10d,%ebx + vmovdqa64 768(%rsp),%zmm13 + vpxorq %zmm14,%zmm13,%zmm13 + vmovdqu64 0(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 832(%rsp),%zmm13 + vmovdqu64 64(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + vpxorq %zmm10,%zmm4,%zmm26 + vpxorq %zmm6,%zmm0,%zmm24 + vpxorq %zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + vmovdqa64 896(%rsp),%zmm13 + vmovdqu64 128(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq 
$0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 960(%rsp),%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + + vpternlogq $0x96,%zmm10,%zmm4,%zmm26 + vpternlogq $0x96,%zmm6,%zmm0,%zmm24 + vpternlogq $0x96,%zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + addl $256,%ebx + movl %r8d,%r10d + addl $15,%r10d + shrl $4,%r10d + je .L_last_num_blocks_is_0_506 + + cmpl $8,%r10d + je .L_last_num_blocks_is_8_506 + jb .L_last_num_blocks_is_7_1_506 + + + cmpl $12,%r10d + je .L_last_num_blocks_is_12_506 + jb .L_last_num_blocks_is_11_9_506 + + + cmpl $15,%r10d + je .L_last_num_blocks_is_15_506 + ja .L_last_num_blocks_is_16_506 + cmpl $14,%r10d + je .L_last_num_blocks_is_14_506 + jmp .L_last_num_blocks_is_13_506 + +.L_last_num_blocks_is_11_9_506: + + cmpl $10,%r10d + je .L_last_num_blocks_is_10_506 + ja .L_last_num_blocks_is_11_506 + jmp .L_last_num_blocks_is_9_506 + +.L_last_num_blocks_is_7_1_506: + cmpl $4,%r10d + je .L_last_num_blocks_is_4_506 + jb .L_last_num_blocks_is_3_1_506 + + cmpl $6,%r10d + ja .L_last_num_blocks_is_7_506 + je .L_last_num_blocks_is_6_506 + jmp .L_last_num_blocks_is_5_506 + +.L_last_num_blocks_is_3_1_506: + + cmpl $2,%r10d + ja .L_last_num_blocks_is_3_506 + je .L_last_num_blocks_is_2_506 +.L_last_num_blocks_is_1_506: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $255,%r15d + jae .L_16_blocks_overflow_507 + vpaddd %xmm28,%xmm2,%xmm0 + jmp .L_16_blocks_ok_507 + +.L_16_blocks_overflow_507: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %xmm29,%xmm0,%xmm0 +.L_16_blocks_ok_507: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%xmm17{%k1}{z} + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq 
$0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %xmm31,%xmm0,%xmm0 + vaesenclast %xmm30,%xmm0,%xmm0 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti32x4 $0,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %xmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm17,%zmm17{%k1}{z} + vpshufb %xmm29,%xmm17,%xmm17 + vextracti32x4 $0,%zmm17,%xmm7 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_508 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_508 +.L_small_initial_partial_block_508: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + + + vpsrldq $8,%zmm26,%zmm0 + vpslldq $8,%zmm26,%zmm3 + vpxorq %zmm0,%zmm24,%zmm24 + vpxorq %zmm3,%zmm25,%zmm25 + vextracti64x4 $1,%zmm24,%ymm0 + vpxorq %ymm0,%ymm24,%ymm24 + vextracti32x4 $1,%ymm24,%xmm0 + vpxorq %xmm0,%xmm24,%xmm24 + vextracti64x4 $1,%zmm25,%ymm3 + vpxorq %ymm3,%ymm25,%ymm25 + vextracti32x4 $1,%ymm25,%xmm3 + vpxorq %xmm3,%xmm25,%xmm25 + vmovdqa64 POLY2(%rip),%xmm0 + + + vpclmulqdq $0x01,%xmm25,%xmm0,%xmm3 + vpslldq $8,%xmm3,%xmm3 + vpxorq %xmm3,%xmm25,%xmm3 + + + vpclmulqdq $0x00,%xmm3,%xmm0,%xmm4 + vpsrldq $4,%xmm4,%xmm4 + vpclmulqdq $0x10,%xmm3,%xmm0,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm24,%xmm4,%xmm14 + + + + + + + + + + + + + vpxorq %xmm7,%xmm14,%xmm14 + + jmp .L_after_reduction_508 +.L_small_initial_compute_done_508: +.L_after_reduction_508: + jmp .L_last_blocks_done_506 +.L_last_num_blocks_is_2_506: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $254,%r15d + jae .L_16_blocks_overflow_509 + vpaddd %ymm28,%ymm2,%ymm0 + jmp .L_16_blocks_ok_509 + +.L_16_blocks_overflow_509: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %ymm29,%ymm0,%ymm0 +.L_16_blocks_ok_509: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq 
$0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%ymm17{%k1}{z} + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %ymm31,%ymm0,%ymm0 + vaesenclast %ymm30,%ymm0,%ymm0 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %ymm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm17,%zmm17{%k1}{z} + vpshufb %ymm29,%ymm17,%ymm17 + vextracti32x4 $1,%zmm17,%xmm7 + subq $16 * (2 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_510 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_510 +.L_small_initial_partial_block_510: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq 
$4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_510: + + orq %r8,%r8 + je .L_after_reduction_510 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_510: + jmp .L_last_blocks_done_506 +.L_last_num_blocks_is_3_506: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $253,%r15d + jae .L_16_blocks_overflow_511 + vpaddd %zmm28,%zmm2,%zmm0 + jmp .L_16_blocks_ok_511 + +.L_16_blocks_overflow_511: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %zmm29,%zmm0,%zmm0 +.L_16_blocks_ok_511: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm0,%zmm0 + vpxorq %zmm17,%zmm0,%zmm0 + vextracti32x4 $2,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm17,%zmm17{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vextracti32x4 $2,%zmm17,%xmm7 + subq $16 * (3 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_512 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 
$1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_512 +.L_small_initial_partial_block_512: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_512: + + orq %r8,%r8 + je .L_after_reduction_512 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_512: + jmp .L_last_blocks_done_506 +.L_last_num_blocks_is_4_506: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $252,%r15d + jae .L_16_blocks_overflow_513 + vpaddd %zmm28,%zmm2,%zmm0 + jmp .L_16_blocks_ok_513 + +.L_16_blocks_overflow_513: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %zmm29,%zmm0,%zmm0 +.L_16_blocks_ok_513: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq 
$0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm0,%zmm0 + vpxorq %zmm17,%zmm0,%zmm0 + vextracti32x4 $3,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm17,%zmm17{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vextracti32x4 $3,%zmm17,%xmm7 + subq $16 * (4 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_514 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_514 +.L_small_initial_partial_block_514: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_514: + + orq %r8,%r8 + je .L_after_reduction_514 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_514: + jmp .L_last_blocks_done_506 +.L_last_num_blocks_is_5_506: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $251,%r15d + jae .L_16_blocks_overflow_515 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %xmm27,%xmm0,%xmm3 + jmp .L_16_blocks_ok_515 + +.L_16_blocks_overflow_515: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb 
%zmm29,%zmm0,%zmm0 + vpshufb %xmm29,%xmm3,%xmm3 +.L_16_blocks_ok_515: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%xmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %xmm30,%xmm3,%xmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %xmm19,%xmm3,%xmm3 + vextracti32x4 $0,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %xmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm19,%zmm19{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %xmm29,%xmm19,%xmm19 + vextracti32x4 $0,%zmm19,%xmm7 + subq $16 * (5 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_516 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 
$1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_516 +.L_small_initial_partial_block_516: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_516: + + orq %r8,%r8 + je .L_after_reduction_516 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_516: + jmp .L_last_blocks_done_506 +.L_last_num_blocks_is_6_506: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $250,%r15d + jae .L_16_blocks_overflow_517 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %ymm27,%ymm0,%ymm3 + jmp .L_16_blocks_ok_517 + +.L_16_blocks_overflow_517: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %ymm29,%ymm3,%ymm3 +.L_16_blocks_ok_517: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq 
$0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%ymm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %ymm30,%ymm3,%ymm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %ymm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm19,%zmm19{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %ymm29,%ymm19,%ymm19 + vextracti32x4 $1,%zmm19,%xmm7 + subq $16 * (6 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_518 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_518 +.L_small_initial_partial_block_518: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 
$1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_518: + + orq %r8,%r8 + je .L_after_reduction_518 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_518: + jmp .L_last_blocks_done_506 +.L_last_num_blocks_is_7_506: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $249,%r15d + jae .L_16_blocks_overflow_519 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + jmp .L_16_blocks_ok_519 + +.L_16_blocks_overflow_519: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 +.L_16_blocks_ok_519: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm0,%zmm0 + 
vaesenclast %zmm30,%zmm3,%zmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti32x4 $2,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm19,%zmm19{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vextracti32x4 $2,%zmm19,%xmm7 + subq $16 * (7 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_520 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_520 +.L_small_initial_partial_block_520: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_520: + + orq %r8,%r8 + je .L_after_reduction_520 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_520: + jmp .L_last_blocks_done_506 +.L_last_num_blocks_is_8_506: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $248,%r15d + jae .L_16_blocks_overflow_521 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + jmp .L_16_blocks_ok_521 + +.L_16_blocks_overflow_521: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 
ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 +.L_16_blocks_ok_521: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti32x4 $3,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm19,%zmm19{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vextracti32x4 $3,%zmm19,%xmm7 + subq $16 * (8 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_522 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + 
vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_522 +.L_small_initial_partial_block_522: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_522: + + orq %r8,%r8 + je .L_after_reduction_522 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_522: + jmp .L_last_blocks_done_506 +.L_last_num_blocks_is_9_506: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $247,%r15d + jae .L_16_blocks_overflow_523 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %xmm27,%xmm3,%xmm4 + jmp .L_16_blocks_ok_523 + +.L_16_blocks_overflow_523: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %xmm29,%xmm4,%xmm4 +.L_16_blocks_ok_523: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq 
$0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%xmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %xmm30,%xmm4,%xmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %xmm20,%xmm4,%xmm4 + vextracti32x4 $0,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %xmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm20,%zmm20{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %xmm29,%xmm20,%xmm20 + vextracti32x4 $0,%zmm20,%xmm7 + subq $16 * (9 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_524 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + 
vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_524 +.L_small_initial_partial_block_524: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_524: + + orq %r8,%r8 + je .L_after_reduction_524 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_524: + jmp .L_last_blocks_done_506 +.L_last_num_blocks_is_10_506: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $246,%r15d + jae .L_16_blocks_overflow_525 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %ymm27,%ymm3,%ymm4 + jmp .L_16_blocks_ok_525 + +.L_16_blocks_overflow_525: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %ymm29,%ymm4,%ymm4 +.L_16_blocks_ok_525: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc 
%zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%ymm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %ymm30,%ymm4,%ymm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %ymm20,%ymm4,%ymm4 + vextracti32x4 $1,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %ymm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm20,%zmm20{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %ymm29,%ymm20,%ymm20 + vextracti32x4 $1,%zmm20,%xmm7 + subq $16 * (10 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_526 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + 
vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_526 +.L_small_initial_partial_block_526: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_526: + + orq %r8,%r8 + je .L_after_reduction_526 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_526: + jmp .L_last_blocks_done_506 +.L_last_num_blocks_is_11_506: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $245,%r15d + jae .L_16_blocks_overflow_527 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + jmp .L_16_blocks_ok_527 + +.L_16_blocks_overflow_527: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 +.L_16_blocks_ok_527: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + 
vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vextracti32x4 $2,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm20,%zmm20{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vextracti32x4 $2,%zmm20,%xmm7 + subq $16 * (11 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_528 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq 
$0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_528 +.L_small_initial_partial_block_528: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_528: + + orq %r8,%r8 + je .L_after_reduction_528 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_528: + jmp .L_last_blocks_done_506 +.L_last_num_blocks_is_12_506: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $244,%r15d + jae .L_16_blocks_overflow_529 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + jmp .L_16_blocks_ok_529 + +.L_16_blocks_overflow_529: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 +.L_16_blocks_ok_529: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + 
vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vextracti32x4 $3,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm20,%zmm20{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vextracti32x4 $3,%zmm20,%xmm7 + subq $16 * (12 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_530 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 160(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + 
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_530 +.L_small_initial_partial_block_530: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_530: + + orq %r8,%r8 + je .L_after_reduction_530 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_530: + jmp .L_last_blocks_done_506 +.L_last_num_blocks_is_13_506: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $243,%r15d + jae .L_16_blocks_overflow_531 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %xmm27,%xmm4,%xmm5 + jmp .L_16_blocks_ok_531 + +.L_16_blocks_overflow_531: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %xmm29,%xmm5,%xmm5 +.L_16_blocks_ok_531: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq 
$0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%xmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %xmm30,%xmm5,%xmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %xmm21,%xmm5,%xmm5 + vextracti32x4 $0,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %xmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm21,%zmm21{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vpshufb %xmm29,%xmm21,%xmm21 + vextracti32x4 $0,%zmm21,%xmm7 + subq $16 * (13 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_532 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 144(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq 
$0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_532 +.L_small_initial_partial_block_532: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 160(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_532: + + orq %r8,%r8 + je .L_after_reduction_532 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_532: + jmp .L_last_blocks_done_506 +.L_last_num_blocks_is_14_506: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $242,%r15d + jae .L_16_blocks_overflow_533 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %ymm27,%ymm4,%ymm5 + jmp .L_16_blocks_ok_533 + +.L_16_blocks_overflow_533: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %ymm29,%ymm5,%ymm5 +.L_16_blocks_ok_533: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + 
vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%ymm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %ymm30,%ymm5,%ymm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %ymm21,%ymm5,%ymm5 + vextracti32x4 $1,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %ymm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm21,%zmm21{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vpshufb %ymm29,%ymm21,%ymm21 + vextracti32x4 $1,%zmm21,%xmm7 + 
subq $16 * (14 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_534 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 128(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_534 +.L_small_initial_partial_block_534: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 144(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq 
$4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_534: + + orq %r8,%r8 + je .L_after_reduction_534 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_534: + jmp .L_last_blocks_done_506 +.L_last_num_blocks_is_15_506: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $241,%r15d + jae .L_16_blocks_overflow_535 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_535 + +.L_16_blocks_overflow_535: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_535: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc 
%zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + vextracti32x4 $2,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm21,%zmm21{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vpshufb %zmm29,%zmm21,%zmm21 + vextracti32x4 $2,%zmm21,%xmm7 + subq $16 * (15 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_536 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 112(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_536 +.L_small_initial_partial_block_536: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 128(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + 
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_536: + + orq %r8,%r8 + je .L_after_reduction_536 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_536: + jmp .L_last_blocks_done_506 +.L_last_num_blocks_is_16_506: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $240,%r15d + jae .L_16_blocks_overflow_537 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_537 + +.L_16_blocks_overflow_537: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_537: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + 
vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + vextracti32x4 $3,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm21,%zmm21{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vpshufb %zmm29,%zmm21,%zmm21 + vextracti32x4 $3,%zmm21,%xmm7 + subq $16 * (16 - 1),%r8 +.L_small_initial_partial_block_538: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 112(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + 
vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_538: + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_538: + jmp .L_last_blocks_done_506 +.L_last_num_blocks_is_0_506: + vmovdqa64 1024(%rsp),%zmm13 + vmovdqu64 0(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 1088(%rsp),%zmm13 + vmovdqu64 64(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + vpternlogq $0x96,%zmm10,%zmm4,%zmm26 + vpternlogq $0x96,%zmm6,%zmm0,%zmm24 + vpternlogq $0x96,%zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + vmovdqa64 1152(%rsp),%zmm13 + vmovdqu64 128(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 1216(%rsp),%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + + vpternlogq $0x96,%zmm10,%zmm4,%zmm26 + vpternlogq $0x96,%zmm6,%zmm0,%zmm24 + vpternlogq $0x96,%zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + + vpsrldq $8,%zmm26,%zmm0 + vpslldq $8,%zmm26,%zmm3 + vpxorq %zmm0,%zmm24,%zmm24 + vpxorq %zmm3,%zmm25,%zmm25 + vextracti64x4 $1,%zmm24,%ymm0 + vpxorq %ymm0,%ymm24,%ymm24 + vextracti32x4 $1,%ymm24,%xmm0 + vpxorq %xmm0,%xmm24,%xmm24 + vextracti64x4 $1,%zmm25,%ymm3 + vpxorq %ymm3,%ymm25,%ymm25 + vextracti32x4 $1,%ymm25,%xmm3 + vpxorq %xmm3,%xmm25,%xmm25 + vmovdqa64 POLY2(%rip),%xmm4 + + + vpclmulqdq $0x01,%xmm25,%xmm4,%xmm0 + vpslldq $8,%xmm0,%xmm0 + vpxorq %xmm0,%xmm25,%xmm0 + + + vpclmulqdq $0x00,%xmm0,%xmm4,%xmm3 + vpsrldq $4,%xmm3,%xmm3 + vpclmulqdq $0x10,%xmm0,%xmm4,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm24,%xmm3,%xmm14 + +.L_last_blocks_done_506: + vpshufb %xmm29,%xmm2,%xmm2 + jmp .L_ghash_done_497 +.L_encrypt_32_blocks_497: + cmpb $240,%r15b + jae .L_16_blocks_overflow_539 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_539 +.L_16_blocks_overflow_539: + vpshufb %zmm29,%zmm2,%zmm2 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_539: + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp),%zmm1 + + + + + vshufi64x2 $255,%zmm5,%zmm5,%zmm2 + addb $16,%r15b + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + + + + + + + + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq 
$0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm6 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + + + + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%zmm21 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm15,%zmm10,%zmm26 + vpxorq %zmm12,%zmm6,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + + + + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + + + + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1) + vpshufb %zmm29,%zmm17,%zmm0 + vpshufb %zmm29,%zmm19,%zmm3 + vpshufb %zmm29,%zmm20,%zmm4 + vpshufb %zmm29,%zmm21,%zmm5 + vmovdqa64 %zmm0,1280(%rsp) + vmovdqa64 %zmm3,1344(%rsp) + vmovdqa64 %zmm4,1408(%rsp) + vmovdqa64 %zmm5,1472(%rsp) + cmpb $240,%r15b + jae .L_16_blocks_overflow_540 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_540 +.L_16_blocks_overflow_540: + vpshufb %zmm29,%zmm2,%zmm2 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + 
vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_540: + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 256(%rsp),%zmm1 + + + + + vshufi64x2 $255,%zmm5,%zmm5,%zmm2 + addb $16,%r15b + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 320(%rsp),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + + + + + + + + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 384(%rsp),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 448(%rsp),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm6 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + + + + vmovdqu8 256(%rcx,%r11,1),%zmm17 + vmovdqu8 320(%rcx,%r11,1),%zmm19 + vmovdqu8 384(%rcx,%r11,1),%zmm20 + vmovdqu8 448(%rcx,%r11,1),%zmm21 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vpternlogq $0x96,%zmm12,%zmm6,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + + + + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + + + + movq %r9,%r10 + vmovdqu8 %zmm0,256(%r10,%r11,1) + vmovdqu8 %zmm3,320(%r10,%r11,1) + vmovdqu8 %zmm4,384(%r10,%r11,1) + vmovdqu8 %zmm5,448(%r10,%r11,1) + vpshufb %zmm29,%zmm17,%zmm0 + vpshufb %zmm29,%zmm19,%zmm3 + vpshufb %zmm29,%zmm20,%zmm4 + vpshufb 
%zmm29,%zmm21,%zmm5 + vmovdqa64 %zmm0,768(%rsp) + vmovdqa64 %zmm3,832(%rsp) + vmovdqa64 %zmm4,896(%rsp) + vmovdqa64 %zmm5,960(%rsp) + vmovdqa64 1280(%rsp),%zmm13 + vmovdqu64 512(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 1344(%rsp),%zmm13 + vmovdqu64 576(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + vpternlogq $0x96,%zmm10,%zmm4,%zmm26 + vpternlogq $0x96,%zmm6,%zmm0,%zmm24 + vpternlogq $0x96,%zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + vmovdqa64 1408(%rsp),%zmm13 + vmovdqu64 640(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 1472(%rsp),%zmm13 + vmovdqu64 704(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + + vpternlogq $0x96,%zmm10,%zmm4,%zmm26 + vpternlogq $0x96,%zmm6,%zmm0,%zmm24 + vpternlogq $0x96,%zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + + vpsrldq $8,%zmm26,%zmm0 + vpslldq $8,%zmm26,%zmm3 + vpxorq %zmm0,%zmm24,%zmm24 + vpxorq %zmm3,%zmm25,%zmm25 + vextracti64x4 $1,%zmm24,%ymm0 + vpxorq %ymm0,%ymm24,%ymm24 + vextracti32x4 $1,%ymm24,%xmm0 + vpxorq %xmm0,%xmm24,%xmm24 + vextracti64x4 $1,%zmm25,%ymm3 + vpxorq %ymm3,%ymm25,%ymm25 + vextracti32x4 $1,%ymm25,%xmm3 + vpxorq %xmm3,%xmm25,%xmm25 + vmovdqa64 POLY2(%rip),%xmm4 + + + vpclmulqdq $0x01,%xmm25,%xmm4,%xmm0 + vpslldq $8,%xmm0,%xmm0 + vpxorq %xmm0,%xmm25,%xmm0 + + + vpclmulqdq $0x00,%xmm0,%xmm4,%xmm3 + vpsrldq $4,%xmm3,%xmm3 + vpclmulqdq $0x10,%xmm0,%xmm4,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm24,%xmm3,%xmm14 + + subq $512,%r8 + addq $512,%r11 + movl %r8d,%r10d + andl $~15,%r10d + movl $512,%ebx + subl %r10d,%ebx + movl %r8d,%r10d + addl $15,%r10d + shrl $4,%r10d + je .L_last_num_blocks_is_0_541 + + cmpl $8,%r10d + je .L_last_num_blocks_is_8_541 + jb .L_last_num_blocks_is_7_1_541 + + + cmpl $12,%r10d + je .L_last_num_blocks_is_12_541 + jb .L_last_num_blocks_is_11_9_541 + + + cmpl $15,%r10d + je .L_last_num_blocks_is_15_541 + ja .L_last_num_blocks_is_16_541 + cmpl $14,%r10d + je .L_last_num_blocks_is_14_541 + jmp .L_last_num_blocks_is_13_541 + +.L_last_num_blocks_is_11_9_541: + + cmpl $10,%r10d + je .L_last_num_blocks_is_10_541 + ja .L_last_num_blocks_is_11_541 + jmp .L_last_num_blocks_is_9_541 + +.L_last_num_blocks_is_7_1_541: + cmpl $4,%r10d + je .L_last_num_blocks_is_4_541 + jb .L_last_num_blocks_is_3_1_541 + + cmpl $6,%r10d + ja .L_last_num_blocks_is_7_541 + je .L_last_num_blocks_is_6_541 + jmp .L_last_num_blocks_is_5_541 + +.L_last_num_blocks_is_3_1_541: + + cmpl $2,%r10d + ja .L_last_num_blocks_is_3_541 + je .L_last_num_blocks_is_2_541 +.L_last_num_blocks_is_1_541: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $255,%r15d + jae .L_16_blocks_overflow_542 + vpaddd %xmm28,%xmm2,%xmm0 + jmp .L_16_blocks_ok_542 + +.L_16_blocks_overflow_542: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %xmm29,%xmm0,%xmm0 +.L_16_blocks_ok_542: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + 
vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%xmm17{%k1}{z} + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %xmm31,%xmm0,%xmm0 + vaesenclast %xmm30,%xmm0,%xmm0 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti32x4 $0,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %xmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm17,%zmm17{%k1}{z} + vpshufb %xmm29,%xmm17,%xmm17 + vextracti32x4 $0,%zmm17,%xmm7 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_543 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_543 +.L_small_initial_partial_block_543: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + + + vpsrldq $8,%zmm26,%zmm0 + vpslldq $8,%zmm26,%zmm3 + vpxorq %zmm0,%zmm24,%zmm24 + vpxorq %zmm3,%zmm25,%zmm25 + vextracti64x4 $1,%zmm24,%ymm0 + vpxorq %ymm0,%ymm24,%ymm24 + vextracti32x4 $1,%ymm24,%xmm0 + vpxorq %xmm0,%xmm24,%xmm24 + vextracti64x4 $1,%zmm25,%ymm3 + vpxorq %ymm3,%ymm25,%ymm25 + vextracti32x4 $1,%ymm25,%xmm3 + vpxorq 
%xmm3,%xmm25,%xmm25 + vmovdqa64 POLY2(%rip),%xmm0 + + + vpclmulqdq $0x01,%xmm25,%xmm0,%xmm3 + vpslldq $8,%xmm3,%xmm3 + vpxorq %xmm3,%xmm25,%xmm3 + + + vpclmulqdq $0x00,%xmm3,%xmm0,%xmm4 + vpsrldq $4,%xmm4,%xmm4 + vpclmulqdq $0x10,%xmm3,%xmm0,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm24,%xmm4,%xmm14 + + + + + + + + + + + + + vpxorq %xmm7,%xmm14,%xmm14 + + jmp .L_after_reduction_543 +.L_small_initial_compute_done_543: +.L_after_reduction_543: + jmp .L_last_blocks_done_541 +.L_last_num_blocks_is_2_541: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $254,%r15d + jae .L_16_blocks_overflow_544 + vpaddd %ymm28,%ymm2,%ymm0 + jmp .L_16_blocks_ok_544 + +.L_16_blocks_overflow_544: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %ymm29,%ymm0,%ymm0 +.L_16_blocks_ok_544: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%ymm17{%k1}{z} + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %ymm31,%ymm0,%ymm0 + vaesenclast %ymm30,%ymm0,%ymm0 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %ymm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm17,%zmm17{%k1}{z} + vpshufb %ymm29,%ymm17,%ymm17 + vextracti32x4 $1,%zmm17,%xmm7 + subq $16 * (2 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_545 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq 
%zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_545 +.L_small_initial_partial_block_545: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_545: + + orq %r8,%r8 + je .L_after_reduction_545 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_545: + jmp .L_last_blocks_done_541 +.L_last_num_blocks_is_3_541: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $253,%r15d + jae .L_16_blocks_overflow_546 + vpaddd %zmm28,%zmm2,%zmm0 + jmp .L_16_blocks_ok_546 + +.L_16_blocks_overflow_546: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %zmm29,%zmm0,%zmm0 +.L_16_blocks_ok_546: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 
0(%rcx,%r11,1),%zmm17{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm0,%zmm0 + vpxorq %zmm17,%zmm0,%zmm0 + vextracti32x4 $2,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm17,%zmm17{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vextracti32x4 $2,%zmm17,%xmm7 + subq $16 * (3 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_547 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_547 +.L_small_initial_partial_block_547: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_547: + + orq %r8,%r8 + je .L_after_reduction_547 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_547: + jmp .L_last_blocks_done_541 +.L_last_num_blocks_is_4_541: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $252,%r15d + jae .L_16_blocks_overflow_548 + vpaddd %zmm28,%zmm2,%zmm0 + jmp .L_16_blocks_ok_548 + +.L_16_blocks_overflow_548: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %zmm29,%zmm0,%zmm0 
+.L_16_blocks_ok_548: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm0,%zmm0 + vpxorq %zmm17,%zmm0,%zmm0 + vextracti32x4 $3,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm17,%zmm17{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vextracti32x4 $3,%zmm17,%xmm7 + subq $16 * (4 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_549 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_549 +.L_small_initial_partial_block_549: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + 
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_549: + + orq %r8,%r8 + je .L_after_reduction_549 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_549: + jmp .L_last_blocks_done_541 +.L_last_num_blocks_is_5_541: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $251,%r15d + jae .L_16_blocks_overflow_550 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %xmm27,%xmm0,%xmm3 + jmp .L_16_blocks_ok_550 + +.L_16_blocks_overflow_550: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %xmm29,%xmm3,%xmm3 +.L_16_blocks_ok_550: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%xmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq 
%zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %xmm30,%xmm3,%xmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %xmm19,%xmm3,%xmm3 + vextracti32x4 $0,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %xmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm19,%zmm19{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %xmm29,%xmm19,%xmm19 + vextracti32x4 $0,%zmm19,%xmm7 + subq $16 * (5 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_551 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_551 +.L_small_initial_partial_block_551: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_551: + + orq %r8,%r8 + je .L_after_reduction_551 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_551: + jmp .L_last_blocks_done_541 +.L_last_num_blocks_is_6_541: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $250,%r15d + jae .L_16_blocks_overflow_552 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %ymm27,%ymm0,%ymm3 + jmp .L_16_blocks_ok_552 + +.L_16_blocks_overflow_552: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd 
ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %ymm29,%ymm3,%ymm3 +.L_16_blocks_ok_552: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%ymm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %ymm30,%ymm3,%ymm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %ymm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm19,%zmm19{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %ymm29,%ymm19,%ymm19 + vextracti32x4 $1,%zmm19,%xmm7 + subq $16 * (6 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_553 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + 
vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_553 +.L_small_initial_partial_block_553: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_553: + + orq %r8,%r8 + je .L_after_reduction_553 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_553: + jmp .L_last_blocks_done_541 +.L_last_num_blocks_is_7_541: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $249,%r15d + jae .L_16_blocks_overflow_554 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + jmp .L_16_blocks_ok_554 + +.L_16_blocks_overflow_554: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 +.L_16_blocks_ok_554: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 
64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti32x4 $2,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm19,%zmm19{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vextracti32x4 $2,%zmm19,%xmm7 + subq $16 * (7 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_555 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_555 +.L_small_initial_partial_block_555: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3 + + vpxorq 
%zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_555: + + orq %r8,%r8 + je .L_after_reduction_555 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_555: + jmp .L_last_blocks_done_541 +.L_last_num_blocks_is_8_541: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $248,%r15d + jae .L_16_blocks_overflow_556 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + jmp .L_16_blocks_ok_556 + +.L_16_blocks_overflow_556: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 +.L_16_blocks_ok_556: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc 
%zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti32x4 $3,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm19,%zmm19{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vextracti32x4 $3,%zmm19,%xmm7 + subq $16 * (8 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_557 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_557 +.L_small_initial_partial_block_557: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_557: + + orq %r8,%r8 + je .L_after_reduction_557 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_557: + jmp .L_last_blocks_done_541 +.L_last_num_blocks_is_9_541: + leaq 
byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $247,%r15d + jae .L_16_blocks_overflow_558 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %xmm27,%xmm3,%xmm4 + jmp .L_16_blocks_ok_558 + +.L_16_blocks_overflow_558: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %xmm29,%xmm4,%xmm4 +.L_16_blocks_ok_558: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%xmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %xmm30,%xmm4,%xmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %xmm20,%xmm4,%xmm4 + vextracti32x4 $0,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %xmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm20,%zmm20{%k1}{z} + vpshufb 
%zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %xmm29,%xmm20,%xmm20 + vextracti32x4 $0,%zmm20,%xmm7 + subq $16 * (9 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_559 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_559 +.L_small_initial_partial_block_559: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_559: + + orq %r8,%r8 + je .L_after_reduction_559 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_559: + jmp .L_last_blocks_done_541 +.L_last_num_blocks_is_10_541: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $246,%r15d + jae .L_16_blocks_overflow_560 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %ymm27,%ymm3,%ymm4 + jmp .L_16_blocks_ok_560 + 
+.L_16_blocks_overflow_560: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %ymm29,%ymm4,%ymm4 +.L_16_blocks_ok_560: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%ymm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %ymm30,%ymm4,%ymm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %ymm20,%ymm4,%ymm4 + vextracti32x4 $1,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %ymm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm20,%zmm20{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %ymm29,%ymm20,%ymm20 + vextracti32x4 $1,%zmm20,%xmm7 + subq $16 * (10 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_561 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 
192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_561 +.L_small_initial_partial_block_561: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_561: + + orq %r8,%r8 + je .L_after_reduction_561 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_561: + jmp .L_last_blocks_done_541 +.L_last_num_blocks_is_11_541: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $245,%r15d + jae .L_16_blocks_overflow_562 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + jmp .L_16_blocks_ok_562 + 
+.L_16_blocks_overflow_562: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 +.L_16_blocks_ok_562: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vextracti32x4 $2,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm20,%zmm20{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vextracti32x4 $2,%zmm20,%xmm7 + subq $16 * (11 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_563 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 
176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_563 +.L_small_initial_partial_block_563: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_563: + + orq %r8,%r8 + je .L_after_reduction_563 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_563: + jmp .L_last_blocks_done_541 +.L_last_num_blocks_is_12_541: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $244,%r15d + jae .L_16_blocks_overflow_564 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + jmp 
.L_16_blocks_ok_564 + +.L_16_blocks_overflow_564: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 +.L_16_blocks_ok_564: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vextracti32x4 $3,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm20,%zmm20{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vextracti32x4 $3,%zmm20,%xmm7 + subq $16 * (12 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_565 + + + + + + subq $16,%r8 + movq 
$0,(%rdx) + vmovdqu64 160(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_565 +.L_small_initial_partial_block_565: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_565: + + orq %r8,%r8 + je .L_after_reduction_565 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_565: + jmp .L_last_blocks_done_541 +.L_last_num_blocks_is_13_541: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $243,%r15d + jae .L_16_blocks_overflow_566 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd 
%zmm27,%zmm3,%zmm4 + vpaddd %xmm27,%xmm4,%xmm5 + jmp .L_16_blocks_ok_566 + +.L_16_blocks_overflow_566: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %xmm29,%xmm5,%xmm5 +.L_16_blocks_ok_566: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%xmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %xmm30,%xmm5,%xmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq 
%zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %xmm21,%xmm5,%xmm5 + vextracti32x4 $0,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %xmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm21,%zmm21{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vpshufb %xmm29,%xmm21,%xmm21 + vextracti32x4 $0,%zmm21,%xmm7 + subq $16 * (13 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_567 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 144(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_567 +.L_small_initial_partial_block_567: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 160(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 
$1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_567: + + orq %r8,%r8 + je .L_after_reduction_567 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_567: + jmp .L_last_blocks_done_541 +.L_last_num_blocks_is_14_541: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $242,%r15d + jae .L_16_blocks_overflow_568 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %ymm27,%ymm4,%ymm5 + jmp .L_16_blocks_ok_568 + +.L_16_blocks_overflow_568: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %ymm29,%ymm5,%ymm5 +.L_16_blocks_ok_568: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%ymm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + 
vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %ymm30,%ymm5,%ymm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %ymm21,%ymm5,%ymm5 + vextracti32x4 $1,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %ymm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm21,%zmm21{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vpshufb %ymm29,%ymm21,%ymm21 + vextracti32x4 $1,%zmm21,%xmm7 + subq $16 * (14 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_569 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 128(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_569 +.L_small_initial_partial_block_569: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 144(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 272(%rsi),%zmm1 
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_569: + + orq %r8,%r8 + je .L_after_reduction_569 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_569: + jmp .L_last_blocks_done_541 +.L_last_num_blocks_is_15_541: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $241,%r15d + jae .L_16_blocks_overflow_570 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_570 + +.L_16_blocks_overflow_570: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_570: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc 
%zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + vextracti32x4 $2,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm21,%zmm21{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vpshufb %zmm29,%zmm21,%zmm21 + vextracti32x4 $2,%zmm21,%xmm7 + subq $16 * (15 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_571 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 112(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq 
%ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_571 +.L_small_initial_partial_block_571: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 128(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_571: + + orq %r8,%r8 + je .L_after_reduction_571 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_571: + jmp .L_last_blocks_done_541 +.L_last_num_blocks_is_16_541: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $240,%r15d + jae .L_16_blocks_overflow_572 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_572 + +.L_16_blocks_overflow_572: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_572: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq 
%zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + vextracti32x4 $3,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm21,%zmm21{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vpshufb %zmm29,%zmm21,%zmm21 + vextracti32x4 $3,%zmm21,%xmm7 + subq $16 * (16 - 1),%r8 +.L_small_initial_partial_block_573: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 112(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq 
$0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_573: + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_573: + jmp .L_last_blocks_done_541 +.L_last_num_blocks_is_0_541: + vmovdqa64 768(%rsp),%zmm13 + vpxorq %zmm14,%zmm13,%zmm13 + vmovdqu64 0(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 832(%rsp),%zmm13 + vmovdqu64 64(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + vpxorq %zmm10,%zmm4,%zmm26 + vpxorq %zmm6,%zmm0,%zmm24 + vpxorq %zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + vmovdqa64 896(%rsp),%zmm13 + vmovdqu64 128(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 960(%rsp),%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + + vpternlogq $0x96,%zmm10,%zmm4,%zmm26 + vpternlogq $0x96,%zmm6,%zmm0,%zmm24 + vpternlogq $0x96,%zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + + vpsrldq $8,%zmm26,%zmm0 + vpslldq $8,%zmm26,%zmm3 + vpxorq %zmm0,%zmm24,%zmm24 + vpxorq %zmm3,%zmm25,%zmm25 + vextracti64x4 $1,%zmm24,%ymm0 + vpxorq %ymm0,%ymm24,%ymm24 + vextracti32x4 $1,%ymm24,%xmm0 + vpxorq %xmm0,%xmm24,%xmm24 + vextracti64x4 $1,%zmm25,%ymm3 + vpxorq %ymm3,%ymm25,%ymm25 + vextracti32x4 $1,%ymm25,%xmm3 + vpxorq %xmm3,%xmm25,%xmm25 + vmovdqa64 POLY2(%rip),%xmm4 + + + vpclmulqdq $0x01,%xmm25,%xmm4,%xmm0 + vpslldq $8,%xmm0,%xmm0 + vpxorq %xmm0,%xmm25,%xmm0 + + + vpclmulqdq $0x00,%xmm0,%xmm4,%xmm3 + vpsrldq $4,%xmm3,%xmm3 + vpclmulqdq $0x10,%xmm0,%xmm4,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm24,%xmm3,%xmm14 + +.L_last_blocks_done_541: + vpshufb 
%xmm29,%xmm2,%xmm2 + jmp .L_ghash_done_497 +.L_encrypt_16_blocks_497: + cmpb $240,%r15b + jae .L_16_blocks_overflow_574 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_574 +.L_16_blocks_overflow_574: + vpshufb %zmm29,%zmm2,%zmm2 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_574: + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp),%zmm1 + + + + + vshufi64x2 $255,%zmm5,%zmm5,%zmm2 + addb $16,%r15b + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + + + + + + + + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm6 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + + + + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%zmm21 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm15,%zmm10,%zmm26 + vpxorq %zmm12,%zmm6,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc 
%zmm31,%zmm5,%zmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + + + + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + + + + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1) + vpshufb %zmm29,%zmm17,%zmm0 + vpshufb %zmm29,%zmm19,%zmm3 + vpshufb %zmm29,%zmm20,%zmm4 + vpshufb %zmm29,%zmm21,%zmm5 + vmovdqa64 %zmm0,1280(%rsp) + vmovdqa64 %zmm3,1344(%rsp) + vmovdqa64 %zmm4,1408(%rsp) + vmovdqa64 %zmm5,1472(%rsp) + vmovdqa64 1024(%rsp),%zmm13 + vmovdqu64 256(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 1088(%rsp),%zmm13 + vmovdqu64 320(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + vpternlogq $0x96,%zmm10,%zmm4,%zmm26 + vpternlogq $0x96,%zmm6,%zmm0,%zmm24 + vpternlogq $0x96,%zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + vmovdqa64 1152(%rsp),%zmm13 + vmovdqu64 384(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 1216(%rsp),%zmm13 + vmovdqu64 448(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + + vpternlogq $0x96,%zmm10,%zmm4,%zmm26 + vpternlogq $0x96,%zmm6,%zmm0,%zmm24 + vpternlogq $0x96,%zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + subq $256,%r8 + addq $256,%r11 + movl %r8d,%r10d + addl $15,%r10d + shrl $4,%r10d + je .L_last_num_blocks_is_0_575 + + cmpl $8,%r10d + je .L_last_num_blocks_is_8_575 + jb .L_last_num_blocks_is_7_1_575 + + + cmpl $12,%r10d + je .L_last_num_blocks_is_12_575 + jb .L_last_num_blocks_is_11_9_575 + + + cmpl $15,%r10d + je .L_last_num_blocks_is_15_575 + ja .L_last_num_blocks_is_16_575 + cmpl $14,%r10d + je .L_last_num_blocks_is_14_575 + jmp .L_last_num_blocks_is_13_575 + +.L_last_num_blocks_is_11_9_575: + + cmpl $10,%r10d + je .L_last_num_blocks_is_10_575 + ja .L_last_num_blocks_is_11_575 + jmp .L_last_num_blocks_is_9_575 + +.L_last_num_blocks_is_7_1_575: + cmpl $4,%r10d + je .L_last_num_blocks_is_4_575 + jb .L_last_num_blocks_is_3_1_575 + + cmpl $6,%r10d + ja .L_last_num_blocks_is_7_575 + je .L_last_num_blocks_is_6_575 + jmp .L_last_num_blocks_is_5_575 + +.L_last_num_blocks_is_3_1_575: + + cmpl $2,%r10d + ja .L_last_num_blocks_is_3_575 + je .L_last_num_blocks_is_2_575 +.L_last_num_blocks_is_1_575: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $255,%r15d + jae .L_16_blocks_overflow_576 + vpaddd %xmm28,%xmm2,%xmm0 + jmp .L_16_blocks_ok_576 + +.L_16_blocks_overflow_576: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %xmm29,%xmm0,%xmm0 +.L_16_blocks_ok_576: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $0,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + 
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%xmm17{%k1}{z} + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %xmm31,%xmm0,%xmm0 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %xmm30,%xmm0,%xmm0 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti32x4 $0,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %xmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm17,%zmm17{%k1}{z} + vpshufb %xmm29,%xmm17,%xmm17 + vextracti32x4 $0,%zmm17,%xmm7 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_577 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_577 
+.L_small_initial_partial_block_577: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + + + + + + + + + + + + vpxorq %xmm7,%xmm14,%xmm14 + + jmp .L_after_reduction_577 +.L_small_initial_compute_done_577: +.L_after_reduction_577: + jmp .L_last_blocks_done_575 +.L_last_num_blocks_is_2_575: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $254,%r15d + jae .L_16_blocks_overflow_578 + vpaddd %ymm28,%ymm2,%ymm0 + jmp .L_16_blocks_ok_578 + +.L_16_blocks_overflow_578: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %ymm29,%ymm0,%ymm0 +.L_16_blocks_ok_578: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $1,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%ymm17{%k1}{z} + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %ymm31,%ymm0,%ymm0 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %ymm30,%ymm0,%ymm0 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %ymm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm17,%zmm17{%k1}{z} + vpshufb %ymm29,%ymm17,%ymm17 + vextracti32x4 $1,%zmm17,%xmm7 + subq $16 * (2 - 1),%r8 + + + 
cmpq $16,%r8 + jl .L_small_initial_partial_block_579 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_579 +.L_small_initial_partial_block_579: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_579: + + orq %r8,%r8 + je .L_after_reduction_579 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_579: + jmp .L_last_blocks_done_575 +.L_last_num_blocks_is_3_575: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $253,%r15d + jae .L_16_blocks_overflow_580 + vpaddd %zmm28,%zmm2,%zmm0 + jmp .L_16_blocks_ok_580 + +.L_16_blocks_overflow_580: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %zmm29,%zmm0,%zmm0 +.L_16_blocks_ok_580: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $2,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq 
$0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vpxorq %zmm17,%zmm0,%zmm0 + vextracti32x4 $2,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm17,%zmm17{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vextracti32x4 $2,%zmm17,%xmm7 + subq $16 * (3 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_581 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_581 +.L_small_initial_partial_block_581: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + 
vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_581: + + orq %r8,%r8 + je .L_after_reduction_581 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_581: + jmp .L_last_blocks_done_575 +.L_last_num_blocks_is_4_575: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $252,%r15d + jae .L_16_blocks_overflow_582 + vpaddd %zmm28,%zmm2,%zmm0 + jmp .L_16_blocks_ok_582 + +.L_16_blocks_overflow_582: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %zmm29,%zmm0,%zmm0 +.L_16_blocks_ok_582: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $3,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + 
vaesenclast %zmm30,%zmm0,%zmm0 + vpxorq %zmm17,%zmm0,%zmm0 + vextracti32x4 $3,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm17,%zmm17{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vextracti32x4 $3,%zmm17,%xmm7 + subq $16 * (4 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_583 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_583 +.L_small_initial_partial_block_583: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_583: + + orq %r8,%r8 + je .L_after_reduction_583 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_583: + jmp .L_last_blocks_done_575 +.L_last_num_blocks_is_5_575: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $251,%r15d + jae .L_16_blocks_overflow_584 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %xmm27,%xmm0,%xmm3 + jmp .L_16_blocks_ok_584 + +.L_16_blocks_overflow_584: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %xmm29,%xmm3,%xmm3 +.L_16_blocks_ok_584: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $0,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc 
%zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%xmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %xmm30,%xmm3,%xmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %xmm19,%xmm3,%xmm3 + vextracti32x4 $0,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %xmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm19,%zmm19{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %xmm29,%xmm19,%xmm19 + vextracti32x4 $0,%zmm19,%xmm7 + subq $16 * (5 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_585 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 
$1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_585 +.L_small_initial_partial_block_585: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_585: + + orq %r8,%r8 + je .L_after_reduction_585 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_585: + jmp .L_last_blocks_done_575 +.L_last_num_blocks_is_6_575: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $250,%r15d + jae .L_16_blocks_overflow_586 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %ymm27,%ymm0,%ymm3 + jmp .L_16_blocks_ok_586 + +.L_16_blocks_overflow_586: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %ymm29,%ymm3,%ymm3 +.L_16_blocks_ok_586: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $1,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + 
vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%ymm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %ymm30,%ymm3,%ymm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %ymm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm19,%zmm19{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %ymm29,%ymm19,%ymm19 + vextracti32x4 $1,%zmm19,%xmm7 + subq $16 * (6 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_587 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_587 +.L_small_initial_partial_block_587: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq 
$0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_587: + + orq %r8,%r8 + je .L_after_reduction_587 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_587: + jmp .L_last_blocks_done_575 +.L_last_num_blocks_is_7_575: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $249,%r15d + jae .L_16_blocks_overflow_588 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + jmp .L_16_blocks_ok_588 + +.L_16_blocks_overflow_588: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 +.L_16_blocks_ok_588: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $2,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc 
%zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti32x4 $2,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm19,%zmm19{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vextracti32x4 $2,%zmm19,%xmm7 + subq $16 * (7 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_589 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_589 +.L_small_initial_partial_block_589: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq 
$8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_589: + + orq %r8,%r8 + je .L_after_reduction_589 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_589: + jmp .L_last_blocks_done_575 +.L_last_num_blocks_is_8_575: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $248,%r15d + jae .L_16_blocks_overflow_590 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + jmp .L_16_blocks_ok_590 + +.L_16_blocks_overflow_590: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 +.L_16_blocks_ok_590: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $3,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc 
%zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti32x4 $3,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm19,%zmm19{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vextracti32x4 $3,%zmm19,%xmm7 + subq $16 * (8 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_591 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_591 +.L_small_initial_partial_block_591: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + 
vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_591: + + orq %r8,%r8 + je .L_after_reduction_591 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_591: + jmp .L_last_blocks_done_575 +.L_last_num_blocks_is_9_575: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $247,%r15d + jae .L_16_blocks_overflow_592 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %xmm27,%xmm3,%xmm4 + jmp .L_16_blocks_ok_592 + +.L_16_blocks_overflow_592: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %xmm29,%xmm4,%xmm4 +.L_16_blocks_ok_592: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $0,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%xmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + 
vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %xmm30,%xmm4,%xmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %xmm20,%xmm4,%xmm4 + vextracti32x4 $0,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %xmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm20,%zmm20{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %xmm29,%xmm20,%xmm20 + vextracti32x4 $0,%zmm20,%xmm7 + subq $16 * (9 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_593 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_593 +.L_small_initial_partial_block_593: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq 
%zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_593: + + orq %r8,%r8 + je .L_after_reduction_593 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_593: + jmp .L_last_blocks_done_575 +.L_last_num_blocks_is_10_575: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $246,%r15d + jae .L_16_blocks_overflow_594 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %ymm27,%ymm3,%ymm4 + jmp .L_16_blocks_ok_594 + +.L_16_blocks_overflow_594: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %ymm29,%ymm4,%ymm4 +.L_16_blocks_ok_594: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $1,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%ymm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq 
$0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %ymm30,%ymm4,%ymm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %ymm20,%ymm4,%ymm4 + vextracti32x4 $1,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %ymm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm20,%zmm20{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %ymm29,%ymm20,%ymm20 + vextracti32x4 $1,%zmm20,%xmm7 + subq $16 * (10 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_595 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_595 +.L_small_initial_partial_block_595: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq 
$0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_595: + + orq %r8,%r8 + je .L_after_reduction_595 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_595: + jmp .L_last_blocks_done_575 +.L_last_num_blocks_is_11_575: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $245,%r15d + jae .L_16_blocks_overflow_596 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + jmp .L_16_blocks_ok_596 + +.L_16_blocks_overflow_596: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 +.L_16_blocks_ok_596: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $2,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc 
%zmm30,%zmm4,%zmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vextracti32x4 $2,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm20,%zmm20{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vextracti32x4 $2,%zmm20,%xmm7 + subq $16 * (11 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_597 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq 
%xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_597 +.L_small_initial_partial_block_597: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_597: + + orq %r8,%r8 + je .L_after_reduction_597 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_597: + jmp .L_last_blocks_done_575 +.L_last_num_blocks_is_12_575: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $244,%r15d + jae .L_16_blocks_overflow_598 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + jmp .L_16_blocks_ok_598 + +.L_16_blocks_overflow_598: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 +.L_16_blocks_ok_598: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $3,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 
+ vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vextracti32x4 $3,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm20,%zmm20{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vextracti32x4 $3,%zmm20,%xmm7 + subq $16 * (12 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_599 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 160(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq 
$0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_599 +.L_small_initial_partial_block_599: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_599: + + orq %r8,%r8 + je .L_after_reduction_599 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_599: + jmp .L_last_blocks_done_575 +.L_last_num_blocks_is_13_575: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $243,%r15d + jae .L_16_blocks_overflow_600 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %xmm27,%xmm4,%xmm5 + jmp .L_16_blocks_ok_600 + +.L_16_blocks_overflow_600: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %xmm29,%xmm5,%xmm5 +.L_16_blocks_ok_600: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $0,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + 
vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%xmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast 
%xmm30,%xmm5,%xmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %xmm21,%xmm5,%xmm5 + vextracti32x4 $0,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %xmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm21,%zmm21{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vpshufb %xmm29,%xmm21,%xmm21 + vextracti32x4 $0,%zmm21,%xmm7 + subq $16 * (13 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_601 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 144(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_601 +.L_small_initial_partial_block_601: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 160(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + 
vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_601: + + orq %r8,%r8 + je .L_after_reduction_601 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_601: + jmp .L_last_blocks_done_575 +.L_last_num_blocks_is_14_575: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $242,%r15d + jae .L_16_blocks_overflow_602 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %ymm27,%ymm4,%ymm5 + jmp .L_16_blocks_ok_602 + +.L_16_blocks_overflow_602: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %ymm29,%ymm5,%ymm5 +.L_16_blocks_ok_602: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $1,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%ymm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq 
$0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %ymm30,%ymm5,%ymm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %ymm21,%ymm5,%ymm5 + vextracti32x4 $1,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %ymm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm21,%zmm21{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vpshufb %ymm29,%ymm21,%ymm21 + vextracti32x4 $1,%zmm21,%xmm7 + subq $16 * (14 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_603 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 128(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq 
$4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_603 +.L_small_initial_partial_block_603: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 144(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_603: + + orq %r8,%r8 + je .L_after_reduction_603 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_603: + jmp .L_last_blocks_done_575 +.L_last_num_blocks_is_15_575: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $241,%r15d + jae .L_16_blocks_overflow_604 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_604 + +.L_16_blocks_overflow_604: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_604: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $2,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 
48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + vextracti32x4 $2,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm21,%zmm21{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vpshufb %zmm29,%zmm21,%zmm21 + vextracti32x4 $2,%zmm21,%xmm7 + subq $16 * (15 - 1),%r8 + + + cmpq $16,%r8 + jl 
.L_small_initial_partial_block_605 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 112(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_605 +.L_small_initial_partial_block_605: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 128(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + 
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_605: + + orq %r8,%r8 + je .L_after_reduction_605 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_605: + jmp .L_last_blocks_done_575 +.L_last_num_blocks_is_16_575: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $240,%r15d + jae .L_16_blocks_overflow_606 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_606 + +.L_16_blocks_overflow_606: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_606: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $3,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc 
%zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + vextracti32x4 $3,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm21,%zmm21{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vpshufb %zmm29,%zmm21,%zmm21 + vextracti32x4 $3,%zmm21,%xmm7 + subq $16 * (16 - 1),%r8 +.L_small_initial_partial_block_607: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 112(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_607: + vpxorq 
%xmm7,%xmm14,%xmm14 +.L_after_reduction_607: + jmp .L_last_blocks_done_575 +.L_last_num_blocks_is_0_575: + vmovdqa64 1280(%rsp),%zmm13 + vmovdqu64 512(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 1344(%rsp),%zmm13 + vmovdqu64 576(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + vpternlogq $0x96,%zmm10,%zmm4,%zmm26 + vpternlogq $0x96,%zmm6,%zmm0,%zmm24 + vpternlogq $0x96,%zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + vmovdqa64 1408(%rsp),%zmm13 + vmovdqu64 640(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 1472(%rsp),%zmm13 + vmovdqu64 704(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + + vpternlogq $0x96,%zmm10,%zmm4,%zmm26 + vpternlogq $0x96,%zmm6,%zmm0,%zmm24 + vpternlogq $0x96,%zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + + vpsrldq $8,%zmm26,%zmm0 + vpslldq $8,%zmm26,%zmm3 + vpxorq %zmm0,%zmm24,%zmm24 + vpxorq %zmm3,%zmm25,%zmm25 + vextracti64x4 $1,%zmm24,%ymm0 + vpxorq %ymm0,%ymm24,%ymm24 + vextracti32x4 $1,%ymm24,%xmm0 + vpxorq %xmm0,%xmm24,%xmm24 + vextracti64x4 $1,%zmm25,%ymm3 + vpxorq %ymm3,%ymm25,%ymm25 + vextracti32x4 $1,%ymm25,%xmm3 + vpxorq %xmm3,%xmm25,%xmm25 + vmovdqa64 POLY2(%rip),%xmm4 + + + vpclmulqdq $0x01,%xmm25,%xmm4,%xmm0 + vpslldq $8,%xmm0,%xmm0 + vpxorq %xmm0,%xmm25,%xmm0 + + + vpclmulqdq $0x00,%xmm0,%xmm4,%xmm3 + vpsrldq $4,%xmm3,%xmm3 + vpclmulqdq $0x10,%xmm0,%xmm4,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm24,%xmm3,%xmm14 + +.L_last_blocks_done_575: + vpshufb %xmm29,%xmm2,%xmm2 + jmp .L_ghash_done_497 + +.L_message_below_32_blocks_497: + + + subq $256,%r8 + addq $256,%r11 + movl %r8d,%r10d + testq %r14,%r14 + jnz .L_skip_hkeys_precomputation_608 + vmovdqu64 640(%rsp),%zmm3 + + + vshufi64x2 $0x00,%zmm3,%zmm3,%zmm3 + + vmovdqu64 576(%rsp),%zmm4 + vmovdqu64 512(%rsp),%zmm5 + + vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4 + vpxorq %zmm10,%zmm4,%zmm4 + + vpsrldq $8,%zmm4,%zmm10 + vpslldq $8,%zmm4,%zmm4 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4 + vpslldq $4,%zmm4,%zmm4 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm4 + + vmovdqu64 %zmm4,448(%rsp) + + vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5 + vpxorq %zmm10,%zmm5,%zmm5 + + vpsrldq $8,%zmm5,%zmm10 + vpslldq $8,%zmm5,%zmm5 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5 + vpslldq $4,%zmm5,%zmm5 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm5 + + vmovdqu64 %zmm5,384(%rsp) + + vpclmulqdq 
$0x11,%zmm3,%zmm4,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4 + vpxorq %zmm10,%zmm4,%zmm4 + + vpsrldq $8,%zmm4,%zmm10 + vpslldq $8,%zmm4,%zmm4 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4 + vpslldq $4,%zmm4,%zmm4 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm4 + + vmovdqu64 %zmm4,320(%rsp) + + vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5 + vpxorq %zmm10,%zmm5,%zmm5 + + vpsrldq $8,%zmm5,%zmm10 + vpslldq $8,%zmm5,%zmm5 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5 + vpslldq $4,%zmm5,%zmm5 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm5 + + vmovdqu64 %zmm5,256(%rsp) +.L_skip_hkeys_precomputation_608: + movq $1,%r14 + andl $~15,%r10d + movl $512,%ebx + subl %r10d,%ebx + movl %r8d,%r10d + addl $15,%r10d + shrl $4,%r10d + je .L_last_num_blocks_is_0_609 + + cmpl $8,%r10d + je .L_last_num_blocks_is_8_609 + jb .L_last_num_blocks_is_7_1_609 + + + cmpl $12,%r10d + je .L_last_num_blocks_is_12_609 + jb .L_last_num_blocks_is_11_9_609 + + + cmpl $15,%r10d + je .L_last_num_blocks_is_15_609 + ja .L_last_num_blocks_is_16_609 + cmpl $14,%r10d + je .L_last_num_blocks_is_14_609 + jmp .L_last_num_blocks_is_13_609 + +.L_last_num_blocks_is_11_9_609: + + cmpl $10,%r10d + je .L_last_num_blocks_is_10_609 + ja .L_last_num_blocks_is_11_609 + jmp .L_last_num_blocks_is_9_609 + +.L_last_num_blocks_is_7_1_609: + cmpl $4,%r10d + je .L_last_num_blocks_is_4_609 + jb .L_last_num_blocks_is_3_1_609 + + cmpl $6,%r10d + ja .L_last_num_blocks_is_7_609 + je .L_last_num_blocks_is_6_609 + jmp .L_last_num_blocks_is_5_609 + +.L_last_num_blocks_is_3_1_609: + + cmpl $2,%r10d + ja .L_last_num_blocks_is_3_609 + je .L_last_num_blocks_is_2_609 +.L_last_num_blocks_is_1_609: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $255,%r15d + jae .L_16_blocks_overflow_610 + vpaddd %xmm28,%xmm2,%xmm0 + jmp .L_16_blocks_ok_610 + +.L_16_blocks_overflow_610: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %xmm29,%xmm0,%xmm0 +.L_16_blocks_ok_610: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + 
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%xmm17{%k1}{z} + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %xmm31,%xmm0,%xmm0 + vaesenclast %xmm30,%xmm0,%xmm0 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti32x4 $0,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %xmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm17,%zmm17{%k1}{z} + vpshufb %xmm29,%xmm17,%xmm17 + vextracti32x4 $0,%zmm17,%xmm7 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_611 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_611 +.L_small_initial_partial_block_611: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + + + vpsrldq $8,%zmm26,%zmm0 + vpslldq $8,%zmm26,%zmm3 + vpxorq %zmm0,%zmm24,%zmm24 + vpxorq %zmm3,%zmm25,%zmm25 + vextracti64x4 $1,%zmm24,%ymm0 + vpxorq %ymm0,%ymm24,%ymm24 + vextracti32x4 $1,%ymm24,%xmm0 + vpxorq %xmm0,%xmm24,%xmm24 + vextracti64x4 $1,%zmm25,%ymm3 + vpxorq %ymm3,%ymm25,%ymm25 + vextracti32x4 $1,%ymm25,%xmm3 + vpxorq %xmm3,%xmm25,%xmm25 + vmovdqa64 POLY2(%rip),%xmm0 + + + vpclmulqdq $0x01,%xmm25,%xmm0,%xmm3 + vpslldq $8,%xmm3,%xmm3 + vpxorq %xmm3,%xmm25,%xmm3 + + + vpclmulqdq $0x00,%xmm3,%xmm0,%xmm4 + vpsrldq $4,%xmm4,%xmm4 + vpclmulqdq $0x10,%xmm3,%xmm0,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm24,%xmm4,%xmm14 + + + + + + + + + + + + + vpxorq %xmm7,%xmm14,%xmm14 + + jmp .L_after_reduction_611 +.L_small_initial_compute_done_611: +.L_after_reduction_611: + jmp .L_last_blocks_done_609 +.L_last_num_blocks_is_2_609: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $254,%r15d + jae .L_16_blocks_overflow_612 + vpaddd %ymm28,%ymm2,%ymm0 + jmp .L_16_blocks_ok_612 + +.L_16_blocks_overflow_612: + 
vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %ymm29,%ymm0,%ymm0 +.L_16_blocks_ok_612: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%ymm17{%k1}{z} + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %ymm31,%ymm0,%ymm0 + vaesenclast %ymm30,%ymm0,%ymm0 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %ymm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm17,%zmm17{%k1}{z} + vpshufb %ymm29,%ymm17,%ymm17 + vextracti32x4 $1,%zmm17,%xmm7 + subq $16 * (2 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_613 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_613 +.L_small_initial_partial_block_613: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 
336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_613: + + orq %r8,%r8 + je .L_after_reduction_613 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_613: + jmp .L_last_blocks_done_609 +.L_last_num_blocks_is_3_609: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $253,%r15d + jae .L_16_blocks_overflow_614 + vpaddd %zmm28,%zmm2,%zmm0 + jmp .L_16_blocks_ok_614 + +.L_16_blocks_overflow_614: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %zmm29,%zmm0,%zmm0 +.L_16_blocks_ok_614: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm0,%zmm0 + vpxorq %zmm17,%zmm0,%zmm0 + vextracti32x4 $2,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 
%zmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm17,%zmm17{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vextracti32x4 $2,%zmm17,%xmm7 + subq $16 * (3 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_615 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_615 +.L_small_initial_partial_block_615: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_615: + + orq %r8,%r8 + je .L_after_reduction_615 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_615: + jmp .L_last_blocks_done_609 +.L_last_num_blocks_is_4_609: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $252,%r15d + jae .L_16_blocks_overflow_616 + vpaddd %zmm28,%zmm2,%zmm0 + jmp .L_16_blocks_ok_616 + +.L_16_blocks_overflow_616: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %zmm29,%zmm0,%zmm0 +.L_16_blocks_ok_616: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq 
$0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm0,%zmm0 + vpxorq %zmm17,%zmm0,%zmm0 + vextracti32x4 $3,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm17,%zmm17{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vextracti32x4 $3,%zmm17,%xmm7 + subq $16 * (4 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_617 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_617 +.L_small_initial_partial_block_617: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq 
$0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_617: + + orq %r8,%r8 + je .L_after_reduction_617 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_617: + jmp .L_last_blocks_done_609 +.L_last_num_blocks_is_5_609: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $251,%r15d + jae .L_16_blocks_overflow_618 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %xmm27,%xmm0,%xmm3 + jmp .L_16_blocks_ok_618 + +.L_16_blocks_overflow_618: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %xmm29,%xmm3,%xmm3 +.L_16_blocks_ok_618: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%xmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %xmm30,%xmm3,%xmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %xmm19,%xmm3,%xmm3 + vextracti32x4 $0,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %xmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm19,%zmm19{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %xmm29,%xmm19,%xmm19 + vextracti32x4 $0,%zmm19,%xmm7 + subq $16 * (5 - 1),%r8 + + + cmpq $16,%r8 + jl 
.L_small_initial_partial_block_619 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_619 +.L_small_initial_partial_block_619: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_619: + + orq %r8,%r8 + je .L_after_reduction_619 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_619: + jmp .L_last_blocks_done_609 +.L_last_num_blocks_is_6_609: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $250,%r15d + jae .L_16_blocks_overflow_620 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %ymm27,%ymm0,%ymm3 + jmp .L_16_blocks_ok_620 + +.L_16_blocks_overflow_620: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %ymm29,%ymm3,%ymm3 +.L_16_blocks_ok_620: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 
128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%ymm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %ymm30,%ymm3,%ymm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %ymm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm19,%zmm19{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %ymm29,%ymm19,%ymm19 + vextracti32x4 $1,%zmm19,%xmm7 + subq $16 * (6 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_621 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_621 +.L_small_initial_partial_block_621: + + + + + + + + + movq %r8,(%rdx) + 
vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_621: + + orq %r8,%r8 + je .L_after_reduction_621 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_621: + jmp .L_last_blocks_done_609 +.L_last_num_blocks_is_7_609: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $249,%r15d + jae .L_16_blocks_overflow_622 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + jmp .L_16_blocks_ok_622 + +.L_16_blocks_overflow_622: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 +.L_16_blocks_ok_622: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + 
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti32x4 $2,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm19,%zmm19{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vextracti32x4 $2,%zmm19,%xmm7 + subq $16 * (7 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_623 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_623 +.L_small_initial_partial_block_623: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + 
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_623: + + orq %r8,%r8 + je .L_after_reduction_623 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_623: + jmp .L_last_blocks_done_609 +.L_last_num_blocks_is_8_609: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $248,%r15d + jae .L_16_blocks_overflow_624 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + jmp .L_16_blocks_ok_624 + +.L_16_blocks_overflow_624: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 +.L_16_blocks_ok_624: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti32x4 $3,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm19,%zmm19{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vextracti32x4 $3,%zmm19,%xmm7 + subq $16 * (8 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_625 + + + + + + subq $16,%r8 + movq 
$0,(%rdx) + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_625 +.L_small_initial_partial_block_625: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_625: + + orq %r8,%r8 + je .L_after_reduction_625 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_625: + jmp .L_last_blocks_done_609 +.L_last_num_blocks_is_9_609: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $247,%r15d + jae .L_16_blocks_overflow_626 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %xmm27,%xmm3,%xmm4 + jmp .L_16_blocks_ok_626 + +.L_16_blocks_overflow_626: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %xmm29,%xmm4,%xmm4 +.L_16_blocks_ok_626: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + 
vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%xmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %xmm30,%xmm4,%xmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %xmm20,%xmm4,%xmm4 + vextracti32x4 $0,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %xmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm20,%zmm20{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %xmm29,%xmm20,%xmm20 + vextracti32x4 $0,%zmm20,%xmm7 + subq $16 * (9 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_627 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 
336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_627 +.L_small_initial_partial_block_627: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_627: + + orq %r8,%r8 + je .L_after_reduction_627 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_627: + jmp .L_last_blocks_done_609 +.L_last_num_blocks_is_10_609: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $246,%r15d + jae .L_16_blocks_overflow_628 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %ymm27,%ymm3,%ymm4 + jmp .L_16_blocks_ok_628 + +.L_16_blocks_overflow_628: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %ymm29,%ymm4,%ymm4 +.L_16_blocks_ok_628: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + 
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%ymm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %ymm30,%ymm4,%ymm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %ymm20,%ymm4,%ymm4 + vextracti32x4 $1,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %ymm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm20,%zmm20{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %ymm29,%ymm20,%ymm20 + vextracti32x4 $1,%zmm20,%xmm7 + subq $16 * (10 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_629 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq 
$0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_629 +.L_small_initial_partial_block_629: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_629: + + orq %r8,%r8 + je .L_after_reduction_629 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_629: + jmp .L_last_blocks_done_609 +.L_last_num_blocks_is_11_609: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $245,%r15d + jae .L_16_blocks_overflow_630 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + jmp .L_16_blocks_ok_630 + +.L_16_blocks_overflow_630: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 +.L_16_blocks_ok_630: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq 
$0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vextracti32x4 $2,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm20,%zmm20{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vextracti32x4 $2,%zmm20,%xmm7 + subq $16 * (11 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_631 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq 
$0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_631 +.L_small_initial_partial_block_631: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_631: + + orq %r8,%r8 + je .L_after_reduction_631 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_631: + jmp .L_last_blocks_done_609 +.L_last_num_blocks_is_12_609: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $244,%r15d + jae .L_16_blocks_overflow_632 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + jmp .L_16_blocks_ok_632 + +.L_16_blocks_overflow_632: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 +.L_16_blocks_ok_632: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq 
$0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vextracti32x4 $3,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm20,%zmm20{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vextracti32x4 $3,%zmm20,%xmm7 + subq $16 * (12 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_633 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 160(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + + vpxorq 
%zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_633 +.L_small_initial_partial_block_633: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_633: + + orq %r8,%r8 + je .L_after_reduction_633 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_633: + jmp .L_last_blocks_done_609 +.L_last_num_blocks_is_13_609: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $243,%r15d + jae .L_16_blocks_overflow_634 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %xmm27,%xmm4,%xmm5 + jmp .L_16_blocks_ok_634 + +.L_16_blocks_overflow_634: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %xmm29,%xmm5,%xmm5 +.L_16_blocks_ok_634: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq 
%zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%xmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %xmm30,%xmm5,%xmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %xmm21,%xmm5,%xmm5 + vextracti32x4 $0,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %xmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm21,%zmm21{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vpshufb %xmm29,%xmm21,%xmm21 + vextracti32x4 $0,%zmm21,%xmm7 + subq $16 * (13 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_635 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 144(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq 
$0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_635 +.L_small_initial_partial_block_635: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 160(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_635: + + orq %r8,%r8 + je .L_after_reduction_635 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_635: + jmp .L_last_blocks_done_609 +.L_last_num_blocks_is_14_609: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $242,%r15d + jae .L_16_blocks_overflow_636 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + 
vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %ymm27,%ymm4,%ymm5 + jmp .L_16_blocks_ok_636 + +.L_16_blocks_overflow_636: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %ymm29,%ymm5,%ymm5 +.L_16_blocks_ok_636: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%ymm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %ymm30,%ymm5,%ymm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq 
%zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %ymm21,%ymm5,%ymm5 + vextracti32x4 $1,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %ymm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm21,%zmm21{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vpshufb %ymm29,%ymm21,%ymm21 + vextracti32x4 $1,%zmm21,%xmm7 + subq $16 * (14 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_637 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 128(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_637 +.L_small_initial_partial_block_637: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 144(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + 
vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_637: + + orq %r8,%r8 + je .L_after_reduction_637 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_637: + jmp .L_last_blocks_done_609 +.L_last_num_blocks_is_15_609: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $241,%r15d + jae .L_16_blocks_overflow_638 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_638 + +.L_16_blocks_overflow_638: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_638: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq 
$0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + vextracti32x4 $2,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm21,%zmm21{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vpshufb %zmm29,%zmm21,%zmm21 + vextracti32x4 $2,%zmm21,%xmm7 + subq $16 * (15 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_639 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 112(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_639 +.L_small_initial_partial_block_639: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 128(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq 
$0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_639: + + orq %r8,%r8 + je .L_after_reduction_639 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_639: + jmp .L_last_blocks_done_609 +.L_last_num_blocks_is_16_609: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $240,%r15d + jae .L_16_blocks_overflow_640 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_640 + +.L_16_blocks_overflow_640: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_640: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 
+ vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + vextracti32x4 $3,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm21,%zmm21{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vpshufb %zmm29,%zmm21,%zmm21 + vextracti32x4 $3,%zmm21,%xmm7 + subq $16 * (16 - 1),%r8 +.L_small_initial_partial_block_641: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 112(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + 
vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_641: + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_641: + jmp .L_last_blocks_done_609 +.L_last_num_blocks_is_0_609: + vmovdqa64 768(%rsp),%zmm13 + vpxorq %zmm14,%zmm13,%zmm13 + vmovdqu64 0(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 832(%rsp),%zmm13 + vmovdqu64 64(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + vpxorq %zmm10,%zmm4,%zmm26 + vpxorq %zmm6,%zmm0,%zmm24 + vpxorq %zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + vmovdqa64 896(%rsp),%zmm13 + vmovdqu64 128(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 960(%rsp),%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + + vpternlogq $0x96,%zmm10,%zmm4,%zmm26 + vpternlogq $0x96,%zmm6,%zmm0,%zmm24 + vpternlogq $0x96,%zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + + vpsrldq $8,%zmm26,%zmm0 + vpslldq $8,%zmm26,%zmm3 + vpxorq %zmm0,%zmm24,%zmm24 + vpxorq %zmm3,%zmm25,%zmm25 + vextracti64x4 $1,%zmm24,%ymm0 + vpxorq %ymm0,%ymm24,%ymm24 + vextracti32x4 $1,%ymm24,%xmm0 + vpxorq %xmm0,%xmm24,%xmm24 + vextracti64x4 $1,%zmm25,%ymm3 + vpxorq %ymm3,%ymm25,%ymm25 + vextracti32x4 $1,%ymm25,%xmm3 + vpxorq %xmm3,%xmm25,%xmm25 + vmovdqa64 POLY2(%rip),%xmm4 + + + vpclmulqdq $0x01,%xmm25,%xmm4,%xmm0 + vpslldq $8,%xmm0,%xmm0 + vpxorq %xmm0,%xmm25,%xmm0 + + + vpclmulqdq $0x00,%xmm0,%xmm4,%xmm3 + vpsrldq $4,%xmm3,%xmm3 + vpclmulqdq $0x10,%xmm0,%xmm4,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm24,%xmm3,%xmm14 + +.L_last_blocks_done_609: + vpshufb %xmm29,%xmm2,%xmm2 + jmp .L_ghash_done_497 + +.L_message_below_equal_16_blocks_497: + + + movl %r8d,%r12d + addl $15,%r12d + shrl $4,%r12d + cmpq $8,%r12 + je .L_small_initial_num_blocks_is_8_642 + jl .L_small_initial_num_blocks_is_7_1_642 + + + cmpq $12,%r12 + je .L_small_initial_num_blocks_is_12_642 + jl .L_small_initial_num_blocks_is_11_9_642 + + + cmpq $16,%r12 + je .L_small_initial_num_blocks_is_16_642 + cmpq $15,%r12 + je .L_small_initial_num_blocks_is_15_642 + cmpq $14,%r12 + je .L_small_initial_num_blocks_is_14_642 + jmp .L_small_initial_num_blocks_is_13_642 + +.L_small_initial_num_blocks_is_11_9_642: + + cmpq $11,%r12 + je .L_small_initial_num_blocks_is_11_642 + cmpq $10,%r12 + je .L_small_initial_num_blocks_is_10_642 + jmp 
.L_small_initial_num_blocks_is_9_642 + +.L_small_initial_num_blocks_is_7_1_642: + cmpq $4,%r12 + je .L_small_initial_num_blocks_is_4_642 + jl .L_small_initial_num_blocks_is_3_1_642 + + cmpq $7,%r12 + je .L_small_initial_num_blocks_is_7_642 + cmpq $6,%r12 + je .L_small_initial_num_blocks_is_6_642 + jmp .L_small_initial_num_blocks_is_5_642 + +.L_small_initial_num_blocks_is_3_1_642: + + cmpq $3,%r12 + je .L_small_initial_num_blocks_is_3_642 + cmpq $2,%r12 + je .L_small_initial_num_blocks_is_2_642 + + + + + +.L_small_initial_num_blocks_is_1_642: + vmovdqa64 SHUF_MASK(%rip),%xmm29 + vpaddd ONE(%rip),%xmm2,%xmm0 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $0,%zmm0,%xmm2 + vpshufb %xmm29,%xmm0,%xmm0 + vmovdqu8 0(%rcx,%r11,1),%xmm6{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenclast %xmm15,%xmm0,%xmm0 + vpxorq %xmm6,%xmm0,%xmm0 + vextracti32x4 $0,%zmm0,%xmm12 + movq %r9,%r10 + vmovdqu8 %xmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %xmm29,%xmm6,%xmm6 + vextracti32x4 $0,%zmm6,%xmm13 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_643 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 336(%rsi),%xmm20 + vpclmulqdq $0x01,%xmm20,%xmm6,%xmm4 + vpclmulqdq $0x10,%xmm20,%xmm6,%xmm5 + vpclmulqdq $0x11,%xmm20,%xmm6,%xmm0 + vpclmulqdq $0x00,%xmm20,%xmm6,%xmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_643 +.L_small_initial_partial_block_643: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + + + + + + + + + + + + vpxorq %xmm13,%xmm14,%xmm14 + + jmp .L_after_reduction_643 +.L_small_initial_compute_done_643: +.L_after_reduction_643: + jmp .L_small_initial_blocks_encrypted_642 +.L_small_initial_num_blocks_is_2_642: + vmovdqa64 SHUF_MASK(%rip),%ymm29 + vshufi64x2 $0,%ymm2,%ymm2,%ymm0 + vpaddd ddq_add_1234(%rip),%ymm0,%ymm0 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $1,%zmm0,%xmm2 + vpshufb %ymm29,%ymm0,%ymm0 + vmovdqu8 0(%rcx,%r11,1),%ymm6{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 
48(%rdi),%zmm15 + vaesenc %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenclast %ymm15,%ymm0,%ymm0 + vpxorq %ymm6,%ymm0,%ymm0 + vextracti32x4 $1,%zmm0,%xmm12 + movq %r9,%r10 + vmovdqu8 %ymm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %ymm29,%ymm6,%ymm6 + vextracti32x4 $1,%zmm6,%xmm13 + subq $16 * (2 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_644 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 320(%rsi),%ymm20 + vpclmulqdq $0x01,%ymm20,%ymm6,%ymm4 + vpclmulqdq $0x10,%ymm20,%ymm6,%ymm5 + vpclmulqdq $0x11,%ymm20,%ymm6,%ymm0 + vpclmulqdq $0x00,%ymm20,%ymm6,%ymm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_644 +.L_small_initial_partial_block_644: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 336(%rsi),%xmm20 + vpclmulqdq $0x01,%xmm20,%xmm6,%xmm4 + vpclmulqdq $0x10,%xmm20,%xmm6,%xmm5 + vpclmulqdq $0x11,%xmm20,%xmm6,%xmm0 + vpclmulqdq $0x00,%xmm20,%xmm6,%xmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_644: + + orq %r8,%r8 + je .L_after_reduction_644 + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_644: + jmp .L_small_initial_blocks_encrypted_642 +.L_small_initial_num_blocks_is_3_642: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $2,%zmm0,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vmovdqu8 0(%rcx,%r11,1),%zmm6{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc 
%zmm15,%zmm0,%zmm0 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vpxorq %zmm6,%zmm0,%zmm0 + vextracti32x4 $2,%zmm0,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %zmm29,%zmm6,%zmm6 + vextracti32x4 $2,%zmm6,%xmm13 + subq $16 * (3 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_645 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 304(%rsi),%ymm20 + vinserti64x2 $2,336(%rsi),%zmm20,%zmm20 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_645 +.L_small_initial_partial_block_645: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 320(%rsi),%ymm20 + vpclmulqdq $0x01,%ymm20,%ymm6,%ymm4 + vpclmulqdq $0x10,%ymm20,%ymm6,%ymm5 + vpclmulqdq $0x11,%ymm20,%ymm6,%ymm0 + vpclmulqdq $0x00,%ymm20,%ymm6,%ymm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_645: + + orq %r8,%r8 + je .L_after_reduction_645 + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_645: + jmp .L_small_initial_blocks_encrypted_642 +.L_small_initial_num_blocks_is_4_642: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $3,%zmm0,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vmovdqu8 0(%rcx,%r11,1),%zmm6{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc 
%zmm15,%zmm0,%zmm0 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vpxorq %zmm6,%zmm0,%zmm0 + vextracti32x4 $3,%zmm0,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %zmm29,%zmm6,%zmm6 + vextracti32x4 $3,%zmm6,%xmm13 + subq $16 * (4 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_646 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 288(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19 + + vpxorq %zmm19,%zmm17,%zmm17 + vpsrldq $8,%zmm17,%zmm4 + vpslldq $8,%zmm17,%zmm5 + vpxorq %zmm4,%zmm15,%zmm0 + vpxorq %zmm5,%zmm16,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_646 +.L_small_initial_partial_block_646: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 304(%rsi),%ymm20 + vinserti64x2 $2,336(%rsi),%zmm20,%zmm20 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_646: + + orq %r8,%r8 + je .L_after_reduction_646 + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_646: + jmp .L_small_initial_blocks_encrypted_642 +.L_small_initial_num_blocks_is_5_642: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $64,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $0,%zmm3,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %xmm29,%xmm3,%xmm3 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%xmm7{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 
48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %xmm15,%xmm3,%xmm3 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %xmm7,%xmm3,%xmm3 + vextracti32x4 $0,%zmm3,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %xmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm6,%zmm6 + vpshufb %xmm29,%xmm7,%xmm7 + vextracti32x4 $0,%zmm7,%xmm13 + subq $16 * (5 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_647 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 272(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19 + vmovdqu64 336(%rsi),%xmm20 + vpclmulqdq $0x01,%xmm20,%xmm7,%xmm4 + vpclmulqdq $0x10,%xmm20,%xmm7,%xmm5 + vpclmulqdq $0x11,%xmm20,%xmm7,%xmm0 + vpclmulqdq $0x00,%xmm20,%xmm7,%xmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_647 +.L_small_initial_partial_block_647: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 288(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19 + + vpxorq %zmm19,%zmm17,%zmm17 + vpsrldq $8,%zmm17,%zmm4 + vpslldq $8,%zmm17,%zmm5 + vpxorq %zmm4,%zmm15,%zmm0 + vpxorq %zmm5,%zmm16,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_647: + + orq %r8,%r8 + je .L_after_reduction_647 + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_647: + jmp .L_small_initial_blocks_encrypted_642 
+.L_small_initial_num_blocks_is_6_642: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $64,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $1,%zmm3,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %ymm29,%ymm3,%ymm3 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%ymm7{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %ymm15,%ymm3,%ymm3 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %ymm7,%ymm3,%ymm3 + vextracti32x4 $1,%zmm3,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %ymm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm6,%zmm6 + vpshufb %ymm29,%ymm7,%ymm7 + vextracti32x4 $1,%zmm7,%xmm13 + subq $16 * (6 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_648 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 256(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19 + vmovdqu64 320(%rsi),%ymm20 + vpclmulqdq $0x01,%ymm20,%ymm7,%ymm4 + vpclmulqdq $0x10,%ymm20,%ymm7,%ymm5 + vpclmulqdq $0x11,%ymm20,%ymm7,%ymm0 + vpclmulqdq $0x00,%ymm20,%ymm7,%ymm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_648 +.L_small_initial_partial_block_648: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 272(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19 + vmovdqu64 336(%rsi),%xmm20 + vpclmulqdq $0x01,%xmm20,%xmm7,%xmm4 + vpclmulqdq $0x10,%xmm20,%xmm7,%xmm5 + vpclmulqdq 
$0x11,%xmm20,%xmm7,%xmm0 + vpclmulqdq $0x00,%xmm20,%xmm7,%xmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_648: + + orq %r8,%r8 + je .L_after_reduction_648 + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_648: + jmp .L_small_initial_blocks_encrypted_642 +.L_small_initial_num_blocks_is_7_642: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $64,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $2,%zmm3,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%zmm7{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %zmm15,%zmm3,%zmm3 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %zmm7,%zmm3,%zmm3 + vextracti32x4 $2,%zmm3,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm6,%zmm6 + vpshufb %zmm29,%zmm7,%zmm7 + vextracti32x4 $2,%zmm7,%xmm13 + subq $16 * (7 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_649 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 240(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19 + vmovdqu64 304(%rsi),%ymm20 + vinserti64x2 $2,336(%rsi),%zmm20,%zmm20 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm5 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + 
vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_649 +.L_small_initial_partial_block_649: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 256(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19 + vmovdqu64 320(%rsi),%ymm20 + vpclmulqdq $0x01,%ymm20,%ymm7,%ymm4 + vpclmulqdq $0x10,%ymm20,%ymm7,%ymm5 + vpclmulqdq $0x11,%ymm20,%ymm7,%ymm0 + vpclmulqdq $0x00,%ymm20,%ymm7,%ymm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_649: + + orq %r8,%r8 + je .L_after_reduction_649 + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_649: + jmp .L_small_initial_blocks_encrypted_642 +.L_small_initial_num_blocks_is_8_642: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $64,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $3,%zmm3,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%zmm7{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast 
%zmm15,%zmm3,%zmm3 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %zmm7,%zmm3,%zmm3 + vextracti32x4 $3,%zmm3,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm6,%zmm6 + vpshufb %zmm29,%zmm7,%zmm7 + vextracti32x4 $3,%zmm7,%xmm13 + subq $16 * (8 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_650 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 224(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 288(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vpxorq %zmm15,%zmm0,%zmm15 + vpxorq %zmm16,%zmm3,%zmm16 + vpxorq %zmm17,%zmm4,%zmm17 + vpxorq %zmm19,%zmm5,%zmm19 + + vpxorq %zmm19,%zmm17,%zmm17 + vpsrldq $8,%zmm17,%zmm4 + vpslldq $8,%zmm17,%zmm5 + vpxorq %zmm4,%zmm15,%zmm0 + vpxorq %zmm5,%zmm16,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_650 +.L_small_initial_partial_block_650: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 240(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19 + vmovdqu64 304(%rsi),%ymm20 + vinserti64x2 $2,336(%rsi),%zmm20,%zmm20 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm5 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_650: + + orq %r8,%r8 + je .L_after_reduction_650 + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_650: + jmp .L_small_initial_blocks_encrypted_642 +.L_small_initial_num_blocks_is_9_642: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + vpaddd ddq_add_8888(%rip),%zmm0,%zmm4 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $128,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $0,%zmm4,%xmm2 + vpshufb 
%zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %xmm29,%xmm4,%xmm4 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%zmm7 + vmovdqu8 128(%rcx,%r11,1),%xmm10{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm15,%zmm3,%zmm3 + vpxorq %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %zmm15,%zmm3,%zmm3 + vaesenclast %xmm15,%xmm4,%xmm4 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %zmm7,%zmm3,%zmm3 + vpxorq %xmm10,%xmm4,%xmm4 + vextracti32x4 $0,%zmm4,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %xmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm6,%zmm6 + vpshufb %zmm29,%zmm7,%zmm7 + vpshufb %xmm29,%xmm10,%xmm10 + vextracti32x4 $0,%zmm10,%xmm13 + subq $16 * (9 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_651 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 208(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 272(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vpxorq %zmm15,%zmm0,%zmm15 + vpxorq %zmm16,%zmm3,%zmm16 + vpxorq %zmm17,%zmm4,%zmm17 + vpxorq %zmm19,%zmm5,%zmm19 + vmovdqu64 336(%rsi),%xmm20 + vpclmulqdq $0x01,%xmm20,%xmm10,%xmm4 + vpclmulqdq $0x10,%xmm20,%xmm10,%xmm5 + vpclmulqdq $0x11,%xmm20,%xmm10,%xmm0 + vpclmulqdq $0x00,%xmm20,%xmm10,%xmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp 
.L_small_initial_compute_done_651 +.L_small_initial_partial_block_651: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 224(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 288(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vpxorq %zmm15,%zmm0,%zmm15 + vpxorq %zmm16,%zmm3,%zmm16 + vpxorq %zmm17,%zmm4,%zmm17 + vpxorq %zmm19,%zmm5,%zmm19 + + vpxorq %zmm19,%zmm17,%zmm17 + vpsrldq $8,%zmm17,%zmm4 + vpslldq $8,%zmm17,%zmm5 + vpxorq %zmm4,%zmm15,%zmm0 + vpxorq %zmm5,%zmm16,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_651: + + orq %r8,%r8 + je .L_after_reduction_651 + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_651: + jmp .L_small_initial_blocks_encrypted_642 +.L_small_initial_num_blocks_is_10_642: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + vpaddd ddq_add_8888(%rip),%zmm0,%zmm4 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $128,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $1,%zmm4,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %ymm29,%ymm4,%ymm4 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%zmm7 + vmovdqu8 128(%rcx,%r11,1),%ymm10{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm15,%zmm3,%zmm3 + vpxorq %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %zmm15,%zmm3,%zmm3 + vaesenclast %ymm15,%ymm4,%ymm4 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %zmm7,%zmm3,%zmm3 + vpxorq %ymm10,%ymm4,%ymm4 + vextracti32x4 $1,%zmm4,%xmm12 + movq %r9,%r10 + vmovdqu8 
%zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %ymm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm6,%zmm6 + vpshufb %zmm29,%zmm7,%zmm7 + vpshufb %ymm29,%ymm10,%ymm10 + vextracti32x4 $1,%zmm10,%xmm13 + subq $16 * (10 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_652 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 192(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 256(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vpxorq %zmm15,%zmm0,%zmm15 + vpxorq %zmm16,%zmm3,%zmm16 + vpxorq %zmm17,%zmm4,%zmm17 + vpxorq %zmm19,%zmm5,%zmm19 + vmovdqu64 320(%rsi),%ymm20 + vpclmulqdq $0x01,%ymm20,%ymm10,%ymm4 + vpclmulqdq $0x10,%ymm20,%ymm10,%ymm5 + vpclmulqdq $0x11,%ymm20,%ymm10,%ymm0 + vpclmulqdq $0x00,%ymm20,%ymm10,%ymm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_652 +.L_small_initial_partial_block_652: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 208(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 272(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vpxorq %zmm15,%zmm0,%zmm15 + vpxorq %zmm16,%zmm3,%zmm16 + vpxorq %zmm17,%zmm4,%zmm17 + vpxorq %zmm19,%zmm5,%zmm19 + vmovdqu64 336(%rsi),%xmm20 + vpclmulqdq $0x01,%xmm20,%xmm10,%xmm4 + vpclmulqdq $0x10,%xmm20,%xmm10,%xmm5 + vpclmulqdq $0x11,%xmm20,%xmm10,%xmm0 + vpclmulqdq $0x00,%xmm20,%xmm10,%xmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_652: + + orq %r8,%r8 + je 
.L_after_reduction_652 + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_652: + jmp .L_small_initial_blocks_encrypted_642 +.L_small_initial_num_blocks_is_11_642: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + vpaddd ddq_add_8888(%rip),%zmm0,%zmm4 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $128,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $2,%zmm4,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%zmm7 + vmovdqu8 128(%rcx,%r11,1),%zmm10{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm15,%zmm3,%zmm3 + vpxorq %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %zmm15,%zmm3,%zmm3 + vaesenclast %zmm15,%zmm4,%zmm4 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %zmm7,%zmm3,%zmm3 + vpxorq %zmm10,%zmm4,%zmm4 + vextracti32x4 $2,%zmm4,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm6,%zmm6 + vpshufb %zmm29,%zmm7,%zmm7 + vpshufb %zmm29,%zmm10,%zmm10 + vextracti32x4 $2,%zmm10,%xmm13 + subq $16 * (11 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_653 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 176(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 240(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vpxorq %zmm15,%zmm0,%zmm15 + vpxorq %zmm16,%zmm3,%zmm16 + vpxorq %zmm17,%zmm4,%zmm17 + vpxorq %zmm19,%zmm5,%zmm19 + vmovdqu64 304(%rsi),%ymm20 + vinserti64x2 $2,336(%rsi),%zmm20,%zmm20 + vpclmulqdq $0x01,%zmm20,%zmm10,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm10,%zmm5 + vpclmulqdq $0x11,%zmm20,%zmm10,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm10,%zmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 
$1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_653 +.L_small_initial_partial_block_653: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 192(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 256(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vpxorq %zmm15,%zmm0,%zmm15 + vpxorq %zmm16,%zmm3,%zmm16 + vpxorq %zmm17,%zmm4,%zmm17 + vpxorq %zmm19,%zmm5,%zmm19 + vmovdqu64 320(%rsi),%ymm20 + vpclmulqdq $0x01,%ymm20,%ymm10,%ymm4 + vpclmulqdq $0x10,%ymm20,%ymm10,%ymm5 + vpclmulqdq $0x11,%ymm20,%ymm10,%ymm0 + vpclmulqdq $0x00,%ymm20,%ymm10,%ymm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_653: + + orq %r8,%r8 + je .L_after_reduction_653 + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_653: + jmp .L_small_initial_blocks_encrypted_642 +.L_small_initial_num_blocks_is_12_642: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + vpaddd ddq_add_8888(%rip),%zmm0,%zmm4 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $128,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $3,%zmm4,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%zmm7 + vmovdqu8 128(%rcx,%r11,1),%zmm10{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm15,%zmm3,%zmm3 + vpxorq %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc 
%zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %zmm15,%zmm3,%zmm3 + vaesenclast %zmm15,%zmm4,%zmm4 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %zmm7,%zmm3,%zmm3 + vpxorq %zmm10,%zmm4,%zmm4 + vextracti32x4 $3,%zmm4,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm6,%zmm6 + vpshufb %zmm29,%zmm7,%zmm7 + vpshufb %zmm29,%zmm10,%zmm10 + vextracti32x4 $3,%zmm10,%xmm13 + subq $16 * (12 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_654 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 160(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 224(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vmovdqu64 288(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm0,%zmm6,%zmm15 + vpternlogq $0x96,%zmm3,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm4,%zmm6,%zmm17 + vpternlogq $0x96,%zmm5,%zmm7,%zmm19 + + vpxorq %zmm19,%zmm17,%zmm17 + vpsrldq $8,%zmm17,%zmm4 + vpslldq $8,%zmm17,%zmm5 + vpxorq %zmm4,%zmm15,%zmm0 + vpxorq %zmm5,%zmm16,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_654 +.L_small_initial_partial_block_654: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 176(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 240(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vpxorq %zmm15,%zmm0,%zmm15 + vpxorq %zmm16,%zmm3,%zmm16 + vpxorq %zmm17,%zmm4,%zmm17 + vpxorq %zmm19,%zmm5,%zmm19 + vmovdqu64 304(%rsi),%ymm20 + vinserti64x2 $2,336(%rsi),%zmm20,%zmm20 + vpclmulqdq $0x01,%zmm20,%zmm10,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm10,%zmm5 + vpclmulqdq $0x11,%zmm20,%zmm10,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm10,%zmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 
+ vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_654: + + orq %r8,%r8 + je .L_after_reduction_654 + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_654: + jmp .L_small_initial_blocks_encrypted_642 +.L_small_initial_num_blocks_is_13_642: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + vpaddd ddq_add_8888(%rip),%zmm0,%zmm4 + vpaddd ddq_add_8888(%rip),%zmm3,%zmm5 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $192,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $0,%zmm5,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %xmm29,%xmm5,%xmm5 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%zmm7 + vmovdqu8 128(%rcx,%r11,1),%zmm10 + vmovdqu8 192(%rcx,%r11,1),%xmm11{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm15,%zmm3,%zmm3 + vpxorq %zmm15,%zmm4,%zmm4 + vpxorq %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %zmm15,%zmm3,%zmm3 + vaesenclast %zmm15,%zmm4,%zmm4 + vaesenclast %xmm15,%xmm5,%xmm5 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %zmm7,%zmm3,%zmm3 + vpxorq %zmm10,%zmm4,%zmm4 + vpxorq %xmm11,%xmm5,%xmm5 + vextracti32x4 $0,%zmm5,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %xmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb 
%zmm29,%zmm6,%zmm6 + vpshufb %zmm29,%zmm7,%zmm7 + vpshufb %zmm29,%zmm10,%zmm10 + vpshufb %xmm29,%xmm11,%xmm11 + vextracti32x4 $0,%zmm11,%xmm13 + subq $16 * (13 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_655 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 144(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 208(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vmovdqu64 272(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm0,%zmm6,%zmm15 + vpternlogq $0x96,%zmm3,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm4,%zmm6,%zmm17 + vpternlogq $0x96,%zmm5,%zmm7,%zmm19 + vmovdqu64 336(%rsi),%xmm20 + vpclmulqdq $0x01,%xmm20,%xmm11,%xmm4 + vpclmulqdq $0x10,%xmm20,%xmm11,%xmm5 + vpclmulqdq $0x11,%xmm20,%xmm11,%xmm0 + vpclmulqdq $0x00,%xmm20,%xmm11,%xmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_655 +.L_small_initial_partial_block_655: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 160(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 224(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vmovdqu64 288(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm0,%zmm6,%zmm15 + vpternlogq $0x96,%zmm3,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm4,%zmm6,%zmm17 + vpternlogq $0x96,%zmm5,%zmm7,%zmm19 + + vpxorq %zmm19,%zmm17,%zmm17 + vpsrldq $8,%zmm17,%zmm4 + vpslldq $8,%zmm17,%zmm5 + vpxorq %zmm4,%zmm15,%zmm0 + vpxorq %zmm5,%zmm16,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + 
+.L_small_initial_compute_done_655: + + orq %r8,%r8 + je .L_after_reduction_655 + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_655: + jmp .L_small_initial_blocks_encrypted_642 +.L_small_initial_num_blocks_is_14_642: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + vpaddd ddq_add_8888(%rip),%zmm0,%zmm4 + vpaddd ddq_add_8888(%rip),%zmm3,%zmm5 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $192,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $1,%zmm5,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %ymm29,%ymm5,%ymm5 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%zmm7 + vmovdqu8 128(%rcx,%r11,1),%zmm10 + vmovdqu8 192(%rcx,%r11,1),%ymm11{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm15,%zmm3,%zmm3 + vpxorq %zmm15,%zmm4,%zmm4 + vpxorq %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %zmm15,%zmm3,%zmm3 + vaesenclast %zmm15,%zmm4,%zmm4 + vaesenclast %ymm15,%ymm5,%ymm5 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %zmm7,%zmm3,%zmm3 + vpxorq %zmm10,%zmm4,%zmm4 + vpxorq %ymm11,%ymm5,%ymm5 + vextracti32x4 $1,%zmm5,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %ymm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm6,%zmm6 + vpshufb %zmm29,%zmm7,%zmm7 + vpshufb %zmm29,%zmm10,%zmm10 + vpshufb %ymm29,%ymm11,%ymm11 + vextracti32x4 $1,%zmm11,%xmm13 + subq $16 * (14 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_656 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 128(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 192(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vmovdqu64 256(%rsi),%zmm20 + vpclmulqdq 
$0x11,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm0,%zmm6,%zmm15 + vpternlogq $0x96,%zmm3,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm4,%zmm6,%zmm17 + vpternlogq $0x96,%zmm5,%zmm7,%zmm19 + vmovdqu64 320(%rsi),%ymm20 + vpclmulqdq $0x01,%ymm20,%ymm11,%ymm4 + vpclmulqdq $0x10,%ymm20,%ymm11,%ymm5 + vpclmulqdq $0x11,%ymm20,%ymm11,%ymm0 + vpclmulqdq $0x00,%ymm20,%ymm11,%ymm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_656 +.L_small_initial_partial_block_656: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 144(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 208(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vmovdqu64 272(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm0,%zmm6,%zmm15 + vpternlogq $0x96,%zmm3,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm4,%zmm6,%zmm17 + vpternlogq $0x96,%zmm5,%zmm7,%zmm19 + vmovdqu64 336(%rsi),%xmm20 + vpclmulqdq $0x01,%xmm20,%xmm11,%xmm4 + vpclmulqdq $0x10,%xmm20,%xmm11,%xmm5 + vpclmulqdq $0x11,%xmm20,%xmm11,%xmm0 + vpclmulqdq $0x00,%xmm20,%xmm11,%xmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_656: + + orq %r8,%r8 + je .L_after_reduction_656 + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_656: + jmp .L_small_initial_blocks_encrypted_642 +.L_small_initial_num_blocks_is_15_642: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + vpaddd ddq_add_8888(%rip),%zmm0,%zmm4 + vpaddd 
ddq_add_8888(%rip),%zmm3,%zmm5 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $192,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $2,%zmm5,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%zmm7 + vmovdqu8 128(%rcx,%r11,1),%zmm10 + vmovdqu8 192(%rcx,%r11,1),%zmm11{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm15,%zmm3,%zmm3 + vpxorq %zmm15,%zmm4,%zmm4 + vpxorq %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %zmm15,%zmm3,%zmm3 + vaesenclast %zmm15,%zmm4,%zmm4 + vaesenclast %zmm15,%zmm5,%zmm5 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %zmm7,%zmm3,%zmm3 + vpxorq %zmm10,%zmm4,%zmm4 + vpxorq %zmm11,%zmm5,%zmm5 + vextracti32x4 $2,%zmm5,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm6,%zmm6 + vpshufb %zmm29,%zmm7,%zmm7 + vpshufb %zmm29,%zmm10,%zmm10 + vpshufb %zmm29,%zmm11,%zmm11 + vextracti32x4 $2,%zmm11,%xmm13 + subq $16 * (15 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_657 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 112(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 176(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vmovdqu64 240(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm0,%zmm6,%zmm15 + vpternlogq $0x96,%zmm3,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm4,%zmm6,%zmm17 + vpternlogq $0x96,%zmm5,%zmm7,%zmm19 + vmovdqu64 304(%rsi),%ymm20 + vinserti64x2 $2,336(%rsi),%zmm20,%zmm20 + vpclmulqdq $0x01,%zmm20,%zmm11,%zmm4 + vpclmulqdq 
$0x10,%zmm20,%zmm11,%zmm5 + vpclmulqdq $0x11,%zmm20,%zmm11,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm11,%zmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_657 +.L_small_initial_partial_block_657: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 128(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 192(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vmovdqu64 256(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm0,%zmm6,%zmm15 + vpternlogq $0x96,%zmm3,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm4,%zmm6,%zmm17 + vpternlogq $0x96,%zmm5,%zmm7,%zmm19 + vmovdqu64 320(%rsi),%ymm20 + vpclmulqdq $0x01,%ymm20,%ymm11,%ymm4 + vpclmulqdq $0x10,%ymm20,%ymm11,%ymm5 + vpclmulqdq $0x11,%ymm20,%ymm11,%ymm0 + vpclmulqdq $0x00,%ymm20,%ymm11,%ymm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_657: + + orq %r8,%r8 + je .L_after_reduction_657 + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_657: + jmp .L_small_initial_blocks_encrypted_642 +.L_small_initial_num_blocks_is_16_642: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + vpaddd ddq_add_8888(%rip),%zmm0,%zmm4 + vpaddd ddq_add_8888(%rip),%zmm3,%zmm5 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $192,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $3,%zmm5,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%zmm7 + vmovdqu8 128(%rcx,%r11,1),%zmm10 + vmovdqu8 
192(%rcx,%r11,1),%zmm11{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm15,%zmm3,%zmm3 + vpxorq %zmm15,%zmm4,%zmm4 + vpxorq %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %zmm15,%zmm3,%zmm3 + vaesenclast %zmm15,%zmm4,%zmm4 + vaesenclast %zmm15,%zmm5,%zmm5 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %zmm7,%zmm3,%zmm3 + vpxorq %zmm10,%zmm4,%zmm4 + vpxorq %zmm11,%zmm5,%zmm5 + vextracti32x4 $3,%zmm5,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm6,%zmm6 + vpshufb %zmm29,%zmm7,%zmm7 + vpshufb %zmm29,%zmm10,%zmm10 + vpshufb %zmm29,%zmm11,%zmm11 + vextracti32x4 $3,%zmm11,%xmm13 + subq $16 * (16 - 1),%r8 +.L_small_initial_partial_block_658: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 112(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 176(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vmovdqu64 240(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm0,%zmm6,%zmm15 + vpternlogq $0x96,%zmm3,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm4,%zmm6,%zmm17 + vpternlogq $0x96,%zmm5,%zmm7,%zmm19 + vmovdqu64 304(%rsi),%ymm20 + vinserti64x2 $2,336(%rsi),%zmm20,%zmm20 + vpclmulqdq $0x01,%zmm20,%zmm11,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm11,%zmm5 + vpclmulqdq $0x11,%zmm20,%zmm11,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm11,%zmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq 
%ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_658: + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_658: +.L_small_initial_blocks_encrypted_642: +.L_ghash_done_497: + vmovdqu64 %xmm2,0(%rsi) + vmovdqu64 %xmm14,64(%rsi) +.L_enc_dec_done_497: + jmp .Lexit_gcm_decrypt +.align 32 +.Laes_gcm_decrypt_192_avx512: + orq %r8,%r8 + je .L_enc_dec_done_659 + xorq %r14,%r14 + vmovdqu64 64(%rsi),%xmm14 + + movq (%rdx),%r11 + orq %r11,%r11 + je .L_partial_block_done_660 + movl $16,%r10d + leaq byte_len_to_mask_table(%rip),%r12 + cmpq %r10,%r8 + cmovcq %r8,%r10 + kmovw (%r12,%r10,2),%k1 + vmovdqu8 (%rcx),%xmm0{%k1}{z} + + vmovdqu64 16(%rsi),%xmm3 + vmovdqu64 336(%rsi),%xmm4 + + + + leaq SHIFT_MASK(%rip),%r12 + addq %r11,%r12 + vmovdqu64 (%r12),%xmm5 + vpshufb %xmm5,%xmm3,%xmm3 + + vmovdqa64 %xmm0,%xmm6 + vpxorq %xmm0,%xmm3,%xmm3 + + + leaq (%r8,%r11,1),%r13 + subq $16,%r13 + jge .L_no_extra_mask_660 + subq %r13,%r12 +.L_no_extra_mask_660: + + + + vmovdqu64 16(%r12),%xmm0 + vpand %xmm0,%xmm3,%xmm3 + vpand %xmm0,%xmm6,%xmm6 + vpshufb SHUF_MASK(%rip),%xmm6,%xmm6 + vpshufb %xmm5,%xmm6,%xmm6 + vpxorq %xmm6,%xmm14,%xmm14 + cmpq $0,%r13 + jl .L_partial_incomplete_660 + + vpclmulqdq $0x11,%xmm4,%xmm14,%xmm7 + vpclmulqdq $0x00,%xmm4,%xmm14,%xmm10 + vpclmulqdq $0x01,%xmm4,%xmm14,%xmm11 + vpclmulqdq $0x10,%xmm4,%xmm14,%xmm14 + vpxorq %xmm11,%xmm14,%xmm14 + + vpsrldq $8,%xmm14,%xmm11 + vpslldq $8,%xmm14,%xmm14 + vpxorq %xmm11,%xmm7,%xmm7 + vpxorq %xmm10,%xmm14,%xmm14 + + + + vmovdqu64 POLY2(%rip),%xmm11 + + vpclmulqdq $0x01,%xmm14,%xmm11,%xmm10 + vpslldq $8,%xmm10,%xmm10 + vpxorq %xmm10,%xmm14,%xmm14 + + + + vpclmulqdq $0x00,%xmm14,%xmm11,%xmm10 + vpsrldq $4,%xmm10,%xmm10 + vpclmulqdq $0x10,%xmm14,%xmm11,%xmm14 + vpslldq $4,%xmm14,%xmm14 + + vpternlogq $0x96,%xmm10,%xmm7,%xmm14 + + movq $0,(%rdx) + + movq %r11,%r12 + movq $16,%r11 + subq %r12,%r11 + jmp .L_enc_dec_done_660 + +.L_partial_incomplete_660: + addq %r8,(%rdx) + movq %r8,%r11 + +.L_enc_dec_done_660: + + + leaq byte_len_to_mask_table(%rip),%r12 + kmovw (%r12,%r11,2),%k1 + vmovdqu64 %xmm14,64(%rsi) + movq %r9,%r12 + vmovdqu8 %xmm3,(%r12){%k1} +.L_partial_block_done_660: + vmovdqu64 0(%rsi),%xmm2 + subq %r11,%r8 + je .L_enc_dec_done_659 + cmpq $256,%r8 + jbe .L_message_below_equal_16_blocks_659 + + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vmovdqa64 ddq_addbe_4444(%rip),%zmm27 + vmovdqa64 ddq_addbe_1234(%rip),%zmm28 + + + + + + + vmovd %xmm2,%r15d + andl $255,%r15d + + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpshufb %zmm29,%zmm2,%zmm2 + + + + cmpb $240,%r15b + jae .L_next_16_overflow_661 + vpaddd %zmm28,%zmm2,%zmm7 + vpaddd %zmm27,%zmm7,%zmm10 + vpaddd %zmm27,%zmm10,%zmm11 + vpaddd %zmm27,%zmm11,%zmm12 + jmp .L_next_16_ok_661 +.L_next_16_overflow_661: + vpshufb %zmm29,%zmm2,%zmm2 + vmovdqa64 ddq_add_4444(%rip),%zmm12 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm7 + vpaddd %zmm12,%zmm7,%zmm10 + vpaddd %zmm12,%zmm10,%zmm11 + vpaddd %zmm12,%zmm11,%zmm12 + vpshufb %zmm29,%zmm7,%zmm7 + vpshufb %zmm29,%zmm10,%zmm10 + vpshufb %zmm29,%zmm11,%zmm11 + vpshufb %zmm29,%zmm12,%zmm12 +.L_next_16_ok_661: + vshufi64x2 
$255,%zmm12,%zmm12,%zmm2 + addb $16,%r15b + + vmovdqu8 0(%rcx,%r11,1),%zmm0 + vmovdqu8 64(%rcx,%r11,1),%zmm3 + vmovdqu8 128(%rcx,%r11,1),%zmm4 + vmovdqu8 192(%rcx,%r11,1),%zmm5 + + + vbroadcastf64x2 0(%rdi),%zmm6 + vpxorq %zmm6,%zmm7,%zmm7 + vpxorq %zmm6,%zmm10,%zmm10 + vpxorq %zmm6,%zmm11,%zmm11 + vpxorq %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 16(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 32(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 48(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 64(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 80(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 96(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 112(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 128(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 144(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 160(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 176(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 192(%rdi),%zmm6 + vaesenclast %zmm6,%zmm7,%zmm7 + vaesenclast %zmm6,%zmm10,%zmm10 + vaesenclast %zmm6,%zmm11,%zmm11 + vaesenclast %zmm6,%zmm12,%zmm12 + + + vpxorq %zmm0,%zmm7,%zmm7 + vpxorq %zmm3,%zmm10,%zmm10 + vpxorq %zmm4,%zmm11,%zmm11 + vpxorq %zmm5,%zmm12,%zmm12 + + + movq %r9,%r10 + vmovdqu8 %zmm7,0(%r10,%r11,1) + vmovdqu8 %zmm10,64(%r10,%r11,1) + vmovdqu8 %zmm11,128(%r10,%r11,1) + vmovdqu8 %zmm12,192(%r10,%r11,1) + + vpshufb %zmm29,%zmm0,%zmm7 + vpshufb %zmm29,%zmm3,%zmm10 + vpshufb %zmm29,%zmm4,%zmm11 + vpshufb %zmm29,%zmm5,%zmm12 + vmovdqa64 %zmm7,768(%rsp) + vmovdqa64 %zmm10,832(%rsp) + vmovdqa64 %zmm11,896(%rsp) + vmovdqa64 %zmm12,960(%rsp) + testq %r14,%r14 + jnz .L_skip_hkeys_precomputation_662 + + vmovdqu64 288(%rsi),%zmm0 + vmovdqu64 %zmm0,704(%rsp) + + vmovdqu64 224(%rsi),%zmm3 + vmovdqu64 %zmm3,640(%rsp) + + + vshufi64x2 $0x00,%zmm3,%zmm3,%zmm3 + + vmovdqu64 160(%rsi),%zmm4 + vmovdqu64 %zmm4,576(%rsp) + + vmovdqu64 96(%rsi),%zmm5 + vmovdqu64 %zmm5,512(%rsp) +.L_skip_hkeys_precomputation_662: + cmpq $512,%r8 + jb .L_message_below_32_blocks_659 + + + + cmpb $240,%r15b + jae .L_next_16_overflow_663 + vpaddd %zmm28,%zmm2,%zmm7 + vpaddd %zmm27,%zmm7,%zmm10 + vpaddd %zmm27,%zmm10,%zmm11 + vpaddd %zmm27,%zmm11,%zmm12 + jmp .L_next_16_ok_663 +.L_next_16_overflow_663: + vpshufb %zmm29,%zmm2,%zmm2 + vmovdqa64 ddq_add_4444(%rip),%zmm12 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm7 + vpaddd %zmm12,%zmm7,%zmm10 + vpaddd %zmm12,%zmm10,%zmm11 + vpaddd %zmm12,%zmm11,%zmm12 + vpshufb %zmm29,%zmm7,%zmm7 + vpshufb 
%zmm29,%zmm10,%zmm10 + vpshufb %zmm29,%zmm11,%zmm11 + vpshufb %zmm29,%zmm12,%zmm12 +.L_next_16_ok_663: + vshufi64x2 $255,%zmm12,%zmm12,%zmm2 + addb $16,%r15b + + vmovdqu8 256(%rcx,%r11,1),%zmm0 + vmovdqu8 320(%rcx,%r11,1),%zmm3 + vmovdqu8 384(%rcx,%r11,1),%zmm4 + vmovdqu8 448(%rcx,%r11,1),%zmm5 + + + vbroadcastf64x2 0(%rdi),%zmm6 + vpxorq %zmm6,%zmm7,%zmm7 + vpxorq %zmm6,%zmm10,%zmm10 + vpxorq %zmm6,%zmm11,%zmm11 + vpxorq %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 16(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 32(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 48(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 64(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 80(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 96(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 112(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 128(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 144(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 160(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 176(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 192(%rdi),%zmm6 + vaesenclast %zmm6,%zmm7,%zmm7 + vaesenclast %zmm6,%zmm10,%zmm10 + vaesenclast %zmm6,%zmm11,%zmm11 + vaesenclast %zmm6,%zmm12,%zmm12 + + + vpxorq %zmm0,%zmm7,%zmm7 + vpxorq %zmm3,%zmm10,%zmm10 + vpxorq %zmm4,%zmm11,%zmm11 + vpxorq %zmm5,%zmm12,%zmm12 + + + movq %r9,%r10 + vmovdqu8 %zmm7,256(%r10,%r11,1) + vmovdqu8 %zmm10,320(%r10,%r11,1) + vmovdqu8 %zmm11,384(%r10,%r11,1) + vmovdqu8 %zmm12,448(%r10,%r11,1) + + vpshufb %zmm29,%zmm0,%zmm7 + vpshufb %zmm29,%zmm3,%zmm10 + vpshufb %zmm29,%zmm4,%zmm11 + vpshufb %zmm29,%zmm5,%zmm12 + vmovdqa64 %zmm7,1024(%rsp) + vmovdqa64 %zmm10,1088(%rsp) + vmovdqa64 %zmm11,1152(%rsp) + vmovdqa64 %zmm12,1216(%rsp) + testq %r14,%r14 + jnz .L_skip_hkeys_precomputation_664 + vmovdqu64 640(%rsp),%zmm3 + + + vshufi64x2 $0x00,%zmm3,%zmm3,%zmm3 + + vmovdqu64 576(%rsp),%zmm4 + vmovdqu64 512(%rsp),%zmm5 + + vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4 + vpxorq %zmm10,%zmm4,%zmm4 + + vpsrldq $8,%zmm4,%zmm10 + vpslldq $8,%zmm4,%zmm4 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4 + vpslldq $4,%zmm4,%zmm4 + + vpternlogq 
$0x96,%zmm7,%zmm6,%zmm4 + + vmovdqu64 %zmm4,448(%rsp) + + vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5 + vpxorq %zmm10,%zmm5,%zmm5 + + vpsrldq $8,%zmm5,%zmm10 + vpslldq $8,%zmm5,%zmm5 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5 + vpslldq $4,%zmm5,%zmm5 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm5 + + vmovdqu64 %zmm5,384(%rsp) + + vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4 + vpxorq %zmm10,%zmm4,%zmm4 + + vpsrldq $8,%zmm4,%zmm10 + vpslldq $8,%zmm4,%zmm4 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4 + vpslldq $4,%zmm4,%zmm4 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm4 + + vmovdqu64 %zmm4,320(%rsp) + + vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5 + vpxorq %zmm10,%zmm5,%zmm5 + + vpsrldq $8,%zmm5,%zmm10 + vpslldq $8,%zmm5,%zmm5 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5 + vpslldq $4,%zmm5,%zmm5 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm5 + + vmovdqu64 %zmm5,256(%rsp) + + vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4 + vpxorq %zmm10,%zmm4,%zmm4 + + vpsrldq $8,%zmm4,%zmm10 + vpslldq $8,%zmm4,%zmm4 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4 + vpslldq $4,%zmm4,%zmm4 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm4 + + vmovdqu64 %zmm4,192(%rsp) + + vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5 + vpxorq %zmm10,%zmm5,%zmm5 + + vpsrldq $8,%zmm5,%zmm10 + vpslldq $8,%zmm5,%zmm5 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5 + vpslldq $4,%zmm5,%zmm5 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm5 + + vmovdqu64 %zmm5,128(%rsp) + + vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4 + vpxorq %zmm10,%zmm4,%zmm4 + + vpsrldq $8,%zmm4,%zmm10 + vpslldq $8,%zmm4,%zmm4 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm4,%zmm4 
+ + + + vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4 + vpslldq $4,%zmm4,%zmm4 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm4 + + vmovdqu64 %zmm4,64(%rsp) + + vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5 + vpxorq %zmm10,%zmm5,%zmm5 + + vpsrldq $8,%zmm5,%zmm10 + vpslldq $8,%zmm5,%zmm5 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5 + vpslldq $4,%zmm5,%zmm5 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm5 + + vmovdqu64 %zmm5,0(%rsp) +.L_skip_hkeys_precomputation_664: + movq $1,%r14 + addq $512,%r11 + subq $512,%r8 + + cmpq $768,%r8 + jb .L_no_more_big_nblocks_659 +.L_encrypt_big_nblocks_659: + cmpb $240,%r15b + jae .L_16_blocks_overflow_665 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_665 +.L_16_blocks_overflow_665: + vpshufb %zmm29,%zmm2,%zmm2 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_665: + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp),%zmm1 + + + + + vshufi64x2 $255,%zmm5,%zmm5,%zmm2 + addb $16,%r15b + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + + + + + + + + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm6 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + + + + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%zmm21 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc 
%zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm15,%zmm10,%zmm26 + vpxorq %zmm12,%zmm6,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + + + + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + + + + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1) + vpshufb %zmm29,%zmm17,%zmm0 + vpshufb %zmm29,%zmm19,%zmm3 + vpshufb %zmm29,%zmm20,%zmm4 + vpshufb %zmm29,%zmm21,%zmm5 + vmovdqa64 %zmm0,1280(%rsp) + vmovdqa64 %zmm3,1344(%rsp) + vmovdqa64 %zmm4,1408(%rsp) + vmovdqa64 %zmm5,1472(%rsp) + cmpb $240,%r15b + jae .L_16_blocks_overflow_666 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_666 +.L_16_blocks_overflow_666: + vpshufb %zmm29,%zmm2,%zmm2 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_666: + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 256(%rsp),%zmm1 + + + + + vshufi64x2 $255,%zmm5,%zmm5,%zmm2 + addb $16,%r15b + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 320(%rsp),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + + + + + + + + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 384(%rsp),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 448(%rsp),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 
64(%rdi),%zmm30 + + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm6 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + + + + vmovdqu8 256(%rcx,%r11,1),%zmm17 + vmovdqu8 320(%rcx,%r11,1),%zmm19 + vmovdqu8 384(%rcx,%r11,1),%zmm20 + vmovdqu8 448(%rcx,%r11,1),%zmm21 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vpternlogq $0x96,%zmm12,%zmm6,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + + + + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + + + + movq %r9,%r10 + vmovdqu8 %zmm0,256(%r10,%r11,1) + vmovdqu8 %zmm3,320(%r10,%r11,1) + vmovdqu8 %zmm4,384(%r10,%r11,1) + vmovdqu8 %zmm5,448(%r10,%r11,1) + vpshufb %zmm29,%zmm17,%zmm0 + vpshufb %zmm29,%zmm19,%zmm3 + vpshufb %zmm29,%zmm20,%zmm4 + vpshufb %zmm29,%zmm21,%zmm5 + vmovdqa64 %zmm0,768(%rsp) + vmovdqa64 %zmm3,832(%rsp) + vmovdqa64 %zmm4,896(%rsp) + vmovdqa64 %zmm5,960(%rsp) + cmpb $240,%r15b + jae .L_16_blocks_overflow_667 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_667 +.L_16_blocks_overflow_667: + vpshufb %zmm29,%zmm2,%zmm2 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_667: + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + + + + + vshufi64x2 $255,%zmm5,%zmm5,%zmm2 + addb $16,%r15b + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + 
vmovdqa64 1344(%rsp),%zmm22 + + + + + + + + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm6 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + + + + vmovdqu8 512(%rcx,%r11,1),%zmm17 + vmovdqu8 576(%rcx,%r11,1),%zmm19 + vmovdqu8 640(%rcx,%r11,1),%zmm20 + vmovdqu8 704(%rcx,%r11,1),%zmm21 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + + + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vpternlogq $0x96,%zmm15,%zmm12,%zmm6 + vpxorq %zmm24,%zmm6,%zmm6 + vpternlogq $0x96,%zmm10,%zmm13,%zmm7 + vpxorq %zmm25,%zmm7,%zmm7 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vextracti64x4 $1,%zmm6,%ymm12 + vpxorq %ymm12,%ymm6,%ymm6 + vextracti32x4 $1,%ymm6,%xmm12 + vpxorq %xmm12,%xmm6,%xmm6 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc 
%zmm31,%zmm5,%zmm5 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm6 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + + + + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + + + + movq %r9,%r10 + vmovdqu8 %zmm0,512(%r10,%r11,1) + vmovdqu8 %zmm3,576(%r10,%r11,1) + vmovdqu8 %zmm4,640(%r10,%r11,1) + vmovdqu8 %zmm5,704(%r10,%r11,1) + vpshufb %zmm29,%zmm17,%zmm0 + vpshufb %zmm29,%zmm19,%zmm3 + vpshufb %zmm29,%zmm20,%zmm4 + vpshufb %zmm29,%zmm21,%zmm5 + vmovdqa64 %zmm0,1024(%rsp) + vmovdqa64 %zmm3,1088(%rsp) + vmovdqa64 %zmm4,1152(%rsp) + vmovdqa64 %zmm5,1216(%rsp) + vmovdqa64 %zmm6,%zmm14 + + addq $768,%r11 + subq $768,%r8 + cmpq $768,%r8 + jae .L_encrypt_big_nblocks_659 + +.L_no_more_big_nblocks_659: + + cmpq $512,%r8 + jae .L_encrypt_32_blocks_659 + + cmpq $256,%r8 + jae .L_encrypt_16_blocks_659 +.L_encrypt_0_blocks_ghash_32_659: + movl %r8d,%r10d + andl $~15,%r10d + movl $256,%ebx + subl %r10d,%ebx + vmovdqa64 768(%rsp),%zmm13 + vpxorq %zmm14,%zmm13,%zmm13 + vmovdqu64 0(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 832(%rsp),%zmm13 + vmovdqu64 64(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + vpxorq %zmm10,%zmm4,%zmm26 + vpxorq %zmm6,%zmm0,%zmm24 + vpxorq %zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + vmovdqa64 896(%rsp),%zmm13 + vmovdqu64 128(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 960(%rsp),%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + + vpternlogq $0x96,%zmm10,%zmm4,%zmm26 + vpternlogq $0x96,%zmm6,%zmm0,%zmm24 + vpternlogq $0x96,%zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + addl $256,%ebx + movl %r8d,%r10d + addl $15,%r10d + shrl $4,%r10d + je .L_last_num_blocks_is_0_668 + + cmpl $8,%r10d + je .L_last_num_blocks_is_8_668 + jb .L_last_num_blocks_is_7_1_668 + + + cmpl $12,%r10d + je .L_last_num_blocks_is_12_668 + jb .L_last_num_blocks_is_11_9_668 + + + cmpl $15,%r10d + je .L_last_num_blocks_is_15_668 + ja .L_last_num_blocks_is_16_668 + cmpl $14,%r10d + je .L_last_num_blocks_is_14_668 + jmp .L_last_num_blocks_is_13_668 + +.L_last_num_blocks_is_11_9_668: + + cmpl $10,%r10d + je .L_last_num_blocks_is_10_668 + ja .L_last_num_blocks_is_11_668 + jmp .L_last_num_blocks_is_9_668 + +.L_last_num_blocks_is_7_1_668: + cmpl $4,%r10d + je .L_last_num_blocks_is_4_668 + jb .L_last_num_blocks_is_3_1_668 + + cmpl $6,%r10d + ja .L_last_num_blocks_is_7_668 + je .L_last_num_blocks_is_6_668 + jmp .L_last_num_blocks_is_5_668 + +.L_last_num_blocks_is_3_1_668: + + cmpl $2,%r10d + ja .L_last_num_blocks_is_3_668 + je .L_last_num_blocks_is_2_668 +.L_last_num_blocks_is_1_668: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $255,%r15d + jae .L_16_blocks_overflow_669 + vpaddd %xmm28,%xmm2,%xmm0 + jmp 
.L_16_blocks_ok_669 + +.L_16_blocks_overflow_669: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %xmm29,%xmm0,%xmm0 +.L_16_blocks_ok_669: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%xmm17{%k1}{z} + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %xmm31,%xmm0,%xmm0 + vaesenclast %xmm30,%xmm0,%xmm0 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti32x4 $0,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %xmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm17,%zmm17{%k1}{z} + vpshufb %xmm29,%xmm17,%xmm17 + vextracti32x4 $0,%zmm17,%xmm7 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_670 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq 
$0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_670 +.L_small_initial_partial_block_670: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + + + vpsrldq $8,%zmm26,%zmm0 + vpslldq $8,%zmm26,%zmm3 + vpxorq %zmm0,%zmm24,%zmm24 + vpxorq %zmm3,%zmm25,%zmm25 + vextracti64x4 $1,%zmm24,%ymm0 + vpxorq %ymm0,%ymm24,%ymm24 + vextracti32x4 $1,%ymm24,%xmm0 + vpxorq %xmm0,%xmm24,%xmm24 + vextracti64x4 $1,%zmm25,%ymm3 + vpxorq %ymm3,%ymm25,%ymm25 + vextracti32x4 $1,%ymm25,%xmm3 + vpxorq %xmm3,%xmm25,%xmm25 + vmovdqa64 POLY2(%rip),%xmm0 + + + vpclmulqdq $0x01,%xmm25,%xmm0,%xmm3 + vpslldq $8,%xmm3,%xmm3 + vpxorq %xmm3,%xmm25,%xmm3 + + + vpclmulqdq $0x00,%xmm3,%xmm0,%xmm4 + vpsrldq $4,%xmm4,%xmm4 + vpclmulqdq $0x10,%xmm3,%xmm0,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm24,%xmm4,%xmm14 + + + + + + + + + + + + + vpxorq %xmm7,%xmm14,%xmm14 + + jmp .L_after_reduction_670 +.L_small_initial_compute_done_670: +.L_after_reduction_670: + jmp .L_last_blocks_done_668 +.L_last_num_blocks_is_2_668: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $254,%r15d + jae .L_16_blocks_overflow_671 + vpaddd %ymm28,%ymm2,%ymm0 + jmp .L_16_blocks_ok_671 + +.L_16_blocks_overflow_671: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %ymm29,%ymm0,%ymm0 +.L_16_blocks_ok_671: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%ymm17{%k1}{z} + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %ymm31,%ymm0,%ymm0 + vaesenclast %ymm30,%ymm0,%ymm0 + vpxorq %ymm17,%ymm0,%ymm0 
+ vextracti32x4 $1,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %ymm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm17,%zmm17{%k1}{z} + vpshufb %ymm29,%ymm17,%ymm17 + vextracti32x4 $1,%zmm17,%xmm7 + subq $16 * (2 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_672 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_672 +.L_small_initial_partial_block_672: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_672: + + orq %r8,%r8 + je .L_after_reduction_672 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_672: + jmp .L_last_blocks_done_668 +.L_last_num_blocks_is_3_668: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $253,%r15d + jae .L_16_blocks_overflow_673 + vpaddd %zmm28,%zmm2,%zmm0 + jmp .L_16_blocks_ok_673 + +.L_16_blocks_overflow_673: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %zmm29,%zmm0,%zmm0 +.L_16_blocks_ok_673: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + 
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm0,%zmm0 + vpxorq %zmm17,%zmm0,%zmm0 + vextracti32x4 $2,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm17,%zmm17{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vextracti32x4 $2,%zmm17,%xmm7 + subq $16 * (3 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_674 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_674 +.L_small_initial_partial_block_674: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + 
vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_674: + + orq %r8,%r8 + je .L_after_reduction_674 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_674: + jmp .L_last_blocks_done_668 +.L_last_num_blocks_is_4_668: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $252,%r15d + jae .L_16_blocks_overflow_675 + vpaddd %zmm28,%zmm2,%zmm0 + jmp .L_16_blocks_ok_675 + +.L_16_blocks_overflow_675: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %zmm29,%zmm0,%zmm0 +.L_16_blocks_ok_675: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm0,%zmm0 + vpxorq %zmm17,%zmm0,%zmm0 + vextracti32x4 $3,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm17,%zmm17{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vextracti32x4 $3,%zmm17,%xmm7 + subq $16 * (4 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_676 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq 
%zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_676 +.L_small_initial_partial_block_676: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_676: + + orq %r8,%r8 + je .L_after_reduction_676 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_676: + jmp .L_last_blocks_done_668 +.L_last_num_blocks_is_5_668: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $251,%r15d + jae .L_16_blocks_overflow_677 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %xmm27,%xmm0,%xmm3 + jmp .L_16_blocks_ok_677 + +.L_16_blocks_overflow_677: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %xmm29,%xmm3,%xmm3 +.L_16_blocks_ok_677: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq 
$0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%xmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %xmm30,%xmm3,%xmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %xmm19,%xmm3,%xmm3 + vextracti32x4 $0,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %xmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm19,%zmm19{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %xmm29,%xmm19,%xmm19 + vextracti32x4 $0,%zmm19,%xmm7 + subq $16 * (5 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_678 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_678 +.L_small_initial_partial_block_678: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + 
vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_678: + + orq %r8,%r8 + je .L_after_reduction_678 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_678: + jmp .L_last_blocks_done_668 +.L_last_num_blocks_is_6_668: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $250,%r15d + jae .L_16_blocks_overflow_679 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %ymm27,%ymm0,%ymm3 + jmp .L_16_blocks_ok_679 + +.L_16_blocks_overflow_679: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %ymm29,%ymm3,%ymm3 +.L_16_blocks_ok_679: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%ymm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc 
%ymm30,%ymm3,%ymm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %ymm30,%ymm3,%ymm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %ymm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm19,%zmm19{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %ymm29,%ymm19,%ymm19 + vextracti32x4 $1,%zmm19,%xmm7 + subq $16 * (6 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_680 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_680 +.L_small_initial_partial_block_680: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_680: + + orq %r8,%r8 + je .L_after_reduction_680 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_680: + jmp .L_last_blocks_done_668 +.L_last_num_blocks_is_7_668: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq 
$64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $249,%r15d + jae .L_16_blocks_overflow_681 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + jmp .L_16_blocks_ok_681 + +.L_16_blocks_overflow_681: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 +.L_16_blocks_ok_681: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti32x4 $2,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm19,%zmm19{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vextracti32x4 $2,%zmm19,%xmm7 + subq $16 * (7 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_682 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq 
$0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_682 +.L_small_initial_partial_block_682: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_682: + + orq %r8,%r8 + je .L_after_reduction_682 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_682: + jmp .L_last_blocks_done_668 +.L_last_num_blocks_is_8_668: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $248,%r15d + jae .L_16_blocks_overflow_683 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + jmp .L_16_blocks_ok_683 + +.L_16_blocks_overflow_683: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 +.L_16_blocks_ok_683: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq 
$0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti32x4 $3,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm19,%zmm19{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vextracti32x4 $3,%zmm19,%xmm7 + subq $16 * (8 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_684 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq 
$0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_684 +.L_small_initial_partial_block_684: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_684: + + orq %r8,%r8 + je .L_after_reduction_684 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_684: + jmp .L_last_blocks_done_668 +.L_last_num_blocks_is_9_668: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $247,%r15d + jae .L_16_blocks_overflow_685 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %xmm27,%xmm3,%xmm4 + jmp .L_16_blocks_ok_685 + +.L_16_blocks_overflow_685: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %xmm29,%xmm4,%xmm4 +.L_16_blocks_ok_685: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq 
$0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%xmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %xmm30,%xmm4,%xmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %xmm20,%xmm4,%xmm4 + vextracti32x4 $0,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %xmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm20,%zmm20{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %xmm29,%xmm20,%xmm20 + vextracti32x4 $0,%zmm20,%xmm7 + subq $16 * (9 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_686 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + 
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_686 +.L_small_initial_partial_block_686: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_686: + + orq %r8,%r8 + je .L_after_reduction_686 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_686: + jmp .L_last_blocks_done_668 +.L_last_num_blocks_is_10_668: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $246,%r15d + jae .L_16_blocks_overflow_687 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %ymm27,%ymm3,%ymm4 + jmp .L_16_blocks_ok_687 + +.L_16_blocks_overflow_687: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %ymm29,%ymm4,%ymm4 +.L_16_blocks_ok_687: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + 
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%ymm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %ymm30,%ymm4,%ymm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %ymm20,%ymm4,%ymm4 + vextracti32x4 $1,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %ymm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm20,%zmm20{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %ymm29,%ymm20,%ymm20 + vextracti32x4 $1,%zmm20,%xmm7 + subq $16 * (10 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_688 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 
POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_688 +.L_small_initial_partial_block_688: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_688: + + orq %r8,%r8 + je .L_after_reduction_688 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_688: + jmp .L_last_blocks_done_668 +.L_last_num_blocks_is_11_668: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $245,%r15d + jae .L_16_blocks_overflow_689 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + jmp .L_16_blocks_ok_689 + +.L_16_blocks_overflow_689: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 +.L_16_blocks_ok_689: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + 
vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vextracti32x4 $2,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm20,%zmm20{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vextracti32x4 $2,%zmm20,%xmm7 + subq $16 * (11 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_690 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq 
%zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_690 +.L_small_initial_partial_block_690: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_690: + + orq %r8,%r8 + je .L_after_reduction_690 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_690: + jmp .L_last_blocks_done_668 +.L_last_num_blocks_is_12_668: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $244,%r15d + jae .L_16_blocks_overflow_691 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + jmp .L_16_blocks_ok_691 + +.L_16_blocks_overflow_691: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 +.L_16_blocks_ok_691: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 
1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vextracti32x4 $3,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm20,%zmm20{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vextracti32x4 $3,%zmm20,%xmm7 + subq $16 * (12 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_692 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 160(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + 
vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_692 +.L_small_initial_partial_block_692: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_692: + + orq %r8,%r8 + je .L_after_reduction_692 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_692: + jmp .L_last_blocks_done_668 +.L_last_num_blocks_is_13_668: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $243,%r15d + jae .L_16_blocks_overflow_693 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %xmm27,%xmm4,%xmm5 + jmp .L_16_blocks_ok_693 + +.L_16_blocks_overflow_693: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %xmm29,%xmm5,%xmm5 +.L_16_blocks_ok_693: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 
64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%xmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %xmm30,%xmm5,%xmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %xmm21,%xmm5,%xmm5 + vextracti32x4 $0,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %xmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm21,%zmm21{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + 
vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vpshufb %xmm29,%xmm21,%xmm21 + vextracti32x4 $0,%zmm21,%xmm7 + subq $16 * (13 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_694 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 144(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_694 +.L_small_initial_partial_block_694: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 160(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + 
+.L_small_initial_compute_done_694: + + orq %r8,%r8 + je .L_after_reduction_694 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_694: + jmp .L_last_blocks_done_668 +.L_last_num_blocks_is_14_668: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $242,%r15d + jae .L_16_blocks_overflow_695 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %ymm27,%ymm4,%ymm5 + jmp .L_16_blocks_ok_695 + +.L_16_blocks_overflow_695: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %ymm29,%ymm5,%ymm5 +.L_16_blocks_ok_695: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%ymm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 144(%rdi),%zmm31 + 
vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %ymm30,%ymm5,%ymm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %ymm21,%ymm5,%ymm5 + vextracti32x4 $1,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %ymm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm21,%zmm21{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vpshufb %ymm29,%ymm21,%ymm21 + vextracti32x4 $1,%zmm21,%xmm7 + subq $16 * (14 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_696 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 128(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_696 +.L_small_initial_partial_block_696: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 144(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq 
$0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_696: + + orq %r8,%r8 + je .L_after_reduction_696 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_696: + jmp .L_last_blocks_done_668 +.L_last_num_blocks_is_15_668: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $241,%r15d + jae .L_16_blocks_overflow_697 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_697 + +.L_16_blocks_overflow_697: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_697: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc 
%zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + vextracti32x4 $2,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm21,%zmm21{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vpshufb %zmm29,%zmm21,%zmm21 + vextracti32x4 $2,%zmm21,%xmm7 + subq $16 * (15 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_698 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 112(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_698 +.L_small_initial_partial_block_698: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 128(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_698: + + orq %r8,%r8 + je .L_after_reduction_698 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_698: + jmp .L_last_blocks_done_668 +.L_last_num_blocks_is_16_668: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $240,%r15d + jae .L_16_blocks_overflow_699 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_699 + +.L_16_blocks_overflow_699: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_699: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 
0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + vextracti32x4 $3,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + 
vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm21,%zmm21{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vpshufb %zmm29,%zmm21,%zmm21 + vextracti32x4 $3,%zmm21,%xmm7 + subq $16 * (16 - 1),%r8 +.L_small_initial_partial_block_700: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 112(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_700: + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_700: + jmp .L_last_blocks_done_668 +.L_last_num_blocks_is_0_668: + vmovdqa64 1024(%rsp),%zmm13 + vmovdqu64 0(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 1088(%rsp),%zmm13 + vmovdqu64 64(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + vpternlogq $0x96,%zmm10,%zmm4,%zmm26 + vpternlogq $0x96,%zmm6,%zmm0,%zmm24 + vpternlogq $0x96,%zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + vmovdqa64 1152(%rsp),%zmm13 + vmovdqu64 128(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 1216(%rsp),%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + + vpternlogq $0x96,%zmm10,%zmm4,%zmm26 + vpternlogq $0x96,%zmm6,%zmm0,%zmm24 + vpternlogq $0x96,%zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + + vpsrldq $8,%zmm26,%zmm0 + vpslldq $8,%zmm26,%zmm3 + vpxorq 
%zmm0,%zmm24,%zmm24 + vpxorq %zmm3,%zmm25,%zmm25 + vextracti64x4 $1,%zmm24,%ymm0 + vpxorq %ymm0,%ymm24,%ymm24 + vextracti32x4 $1,%ymm24,%xmm0 + vpxorq %xmm0,%xmm24,%xmm24 + vextracti64x4 $1,%zmm25,%ymm3 + vpxorq %ymm3,%ymm25,%ymm25 + vextracti32x4 $1,%ymm25,%xmm3 + vpxorq %xmm3,%xmm25,%xmm25 + vmovdqa64 POLY2(%rip),%xmm4 + + + vpclmulqdq $0x01,%xmm25,%xmm4,%xmm0 + vpslldq $8,%xmm0,%xmm0 + vpxorq %xmm0,%xmm25,%xmm0 + + + vpclmulqdq $0x00,%xmm0,%xmm4,%xmm3 + vpsrldq $4,%xmm3,%xmm3 + vpclmulqdq $0x10,%xmm0,%xmm4,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm24,%xmm3,%xmm14 + +.L_last_blocks_done_668: + vpshufb %xmm29,%xmm2,%xmm2 + jmp .L_ghash_done_659 +.L_encrypt_32_blocks_659: + cmpb $240,%r15b + jae .L_16_blocks_overflow_701 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_701 +.L_16_blocks_overflow_701: + vpshufb %zmm29,%zmm2,%zmm2 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_701: + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp),%zmm1 + + + + + vshufi64x2 $255,%zmm5,%zmm5,%zmm2 + addb $16,%r15b + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + + + + + + + + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm6 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + + + + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%zmm21 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + 
vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm15,%zmm10,%zmm26 + vpxorq %zmm12,%zmm6,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + + + + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + + + + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1) + vpshufb %zmm29,%zmm17,%zmm0 + vpshufb %zmm29,%zmm19,%zmm3 + vpshufb %zmm29,%zmm20,%zmm4 + vpshufb %zmm29,%zmm21,%zmm5 + vmovdqa64 %zmm0,1280(%rsp) + vmovdqa64 %zmm3,1344(%rsp) + vmovdqa64 %zmm4,1408(%rsp) + vmovdqa64 %zmm5,1472(%rsp) + cmpb $240,%r15b + jae .L_16_blocks_overflow_702 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_702 +.L_16_blocks_overflow_702: + vpshufb %zmm29,%zmm2,%zmm2 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_702: + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 256(%rsp),%zmm1 + + + + + vshufi64x2 $255,%zmm5,%zmm5,%zmm2 + addb $16,%r15b + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 320(%rsp),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + + + + + + + + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 384(%rsp),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 448(%rsp),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc 
%zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm6 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + + + + vmovdqu8 256(%rcx,%r11,1),%zmm17 + vmovdqu8 320(%rcx,%r11,1),%zmm19 + vmovdqu8 384(%rcx,%r11,1),%zmm20 + vmovdqu8 448(%rcx,%r11,1),%zmm21 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vpternlogq $0x96,%zmm12,%zmm6,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + + + + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + + + + movq %r9,%r10 + vmovdqu8 %zmm0,256(%r10,%r11,1) + vmovdqu8 %zmm3,320(%r10,%r11,1) + vmovdqu8 %zmm4,384(%r10,%r11,1) + vmovdqu8 %zmm5,448(%r10,%r11,1) + vpshufb %zmm29,%zmm17,%zmm0 + vpshufb %zmm29,%zmm19,%zmm3 + vpshufb %zmm29,%zmm20,%zmm4 + vpshufb %zmm29,%zmm21,%zmm5 + vmovdqa64 %zmm0,768(%rsp) + vmovdqa64 %zmm3,832(%rsp) + vmovdqa64 %zmm4,896(%rsp) + vmovdqa64 %zmm5,960(%rsp) + vmovdqa64 1280(%rsp),%zmm13 + vmovdqu64 512(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 1344(%rsp),%zmm13 + vmovdqu64 576(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + vpternlogq $0x96,%zmm10,%zmm4,%zmm26 + vpternlogq $0x96,%zmm6,%zmm0,%zmm24 + vpternlogq $0x96,%zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + vmovdqa64 1408(%rsp),%zmm13 + vmovdqu64 640(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 1472(%rsp),%zmm13 + vmovdqu64 704(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + + vpternlogq 
$0x96,%zmm10,%zmm4,%zmm26 + vpternlogq $0x96,%zmm6,%zmm0,%zmm24 + vpternlogq $0x96,%zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + + vpsrldq $8,%zmm26,%zmm0 + vpslldq $8,%zmm26,%zmm3 + vpxorq %zmm0,%zmm24,%zmm24 + vpxorq %zmm3,%zmm25,%zmm25 + vextracti64x4 $1,%zmm24,%ymm0 + vpxorq %ymm0,%ymm24,%ymm24 + vextracti32x4 $1,%ymm24,%xmm0 + vpxorq %xmm0,%xmm24,%xmm24 + vextracti64x4 $1,%zmm25,%ymm3 + vpxorq %ymm3,%ymm25,%ymm25 + vextracti32x4 $1,%ymm25,%xmm3 + vpxorq %xmm3,%xmm25,%xmm25 + vmovdqa64 POLY2(%rip),%xmm4 + + + vpclmulqdq $0x01,%xmm25,%xmm4,%xmm0 + vpslldq $8,%xmm0,%xmm0 + vpxorq %xmm0,%xmm25,%xmm0 + + + vpclmulqdq $0x00,%xmm0,%xmm4,%xmm3 + vpsrldq $4,%xmm3,%xmm3 + vpclmulqdq $0x10,%xmm0,%xmm4,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm24,%xmm3,%xmm14 + + subq $512,%r8 + addq $512,%r11 + movl %r8d,%r10d + andl $~15,%r10d + movl $512,%ebx + subl %r10d,%ebx + movl %r8d,%r10d + addl $15,%r10d + shrl $4,%r10d + je .L_last_num_blocks_is_0_703 + + cmpl $8,%r10d + je .L_last_num_blocks_is_8_703 + jb .L_last_num_blocks_is_7_1_703 + + + cmpl $12,%r10d + je .L_last_num_blocks_is_12_703 + jb .L_last_num_blocks_is_11_9_703 + + + cmpl $15,%r10d + je .L_last_num_blocks_is_15_703 + ja .L_last_num_blocks_is_16_703 + cmpl $14,%r10d + je .L_last_num_blocks_is_14_703 + jmp .L_last_num_blocks_is_13_703 + +.L_last_num_blocks_is_11_9_703: + + cmpl $10,%r10d + je .L_last_num_blocks_is_10_703 + ja .L_last_num_blocks_is_11_703 + jmp .L_last_num_blocks_is_9_703 + +.L_last_num_blocks_is_7_1_703: + cmpl $4,%r10d + je .L_last_num_blocks_is_4_703 + jb .L_last_num_blocks_is_3_1_703 + + cmpl $6,%r10d + ja .L_last_num_blocks_is_7_703 + je .L_last_num_blocks_is_6_703 + jmp .L_last_num_blocks_is_5_703 + +.L_last_num_blocks_is_3_1_703: + + cmpl $2,%r10d + ja .L_last_num_blocks_is_3_703 + je .L_last_num_blocks_is_2_703 +.L_last_num_blocks_is_1_703: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $255,%r15d + jae .L_16_blocks_overflow_704 + vpaddd %xmm28,%xmm2,%xmm0 + jmp .L_16_blocks_ok_704 + +.L_16_blocks_overflow_704: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %xmm29,%xmm0,%xmm0 +.L_16_blocks_ok_704: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 
96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%xmm17{%k1}{z} + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %xmm31,%xmm0,%xmm0 + vaesenclast %xmm30,%xmm0,%xmm0 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti32x4 $0,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %xmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm17,%zmm17{%k1}{z} + vpshufb %xmm29,%xmm17,%xmm17 + vextracti32x4 $0,%zmm17,%xmm7 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_705 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_705 +.L_small_initial_partial_block_705: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + + + vpsrldq $8,%zmm26,%zmm0 + vpslldq $8,%zmm26,%zmm3 + vpxorq %zmm0,%zmm24,%zmm24 + vpxorq %zmm3,%zmm25,%zmm25 + vextracti64x4 $1,%zmm24,%ymm0 + vpxorq %ymm0,%ymm24,%ymm24 + vextracti32x4 $1,%ymm24,%xmm0 + vpxorq %xmm0,%xmm24,%xmm24 + vextracti64x4 $1,%zmm25,%ymm3 + vpxorq %ymm3,%ymm25,%ymm25 + vextracti32x4 $1,%ymm25,%xmm3 + vpxorq %xmm3,%xmm25,%xmm25 + vmovdqa64 POLY2(%rip),%xmm0 + + + vpclmulqdq $0x01,%xmm25,%xmm0,%xmm3 + vpslldq $8,%xmm3,%xmm3 + vpxorq %xmm3,%xmm25,%xmm3 + + + vpclmulqdq $0x00,%xmm3,%xmm0,%xmm4 + vpsrldq $4,%xmm4,%xmm4 + vpclmulqdq $0x10,%xmm3,%xmm0,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm24,%xmm4,%xmm14 + + + + + + + + + + + + + vpxorq %xmm7,%xmm14,%xmm14 + + jmp .L_after_reduction_705 +.L_small_initial_compute_done_705: +.L_after_reduction_705: + jmp .L_last_blocks_done_703 +.L_last_num_blocks_is_2_703: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $254,%r15d + jae .L_16_blocks_overflow_706 + vpaddd %ymm28,%ymm2,%ymm0 + jmp .L_16_blocks_ok_706 + +.L_16_blocks_overflow_706: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %ymm29,%ymm0,%ymm0 +.L_16_blocks_ok_706: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + 
vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%ymm17{%k1}{z} + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %ymm31,%ymm0,%ymm0 + vaesenclast %ymm30,%ymm0,%ymm0 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %ymm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm17,%zmm17{%k1}{z} + vpshufb %ymm29,%ymm17,%ymm17 + vextracti32x4 $1,%zmm17,%xmm7 + subq $16 * (2 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_707 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_707 +.L_small_initial_partial_block_707: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3 + vpxorq 
%zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_707: + + orq %r8,%r8 + je .L_after_reduction_707 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_707: + jmp .L_last_blocks_done_703 +.L_last_num_blocks_is_3_703: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $253,%r15d + jae .L_16_blocks_overflow_708 + vpaddd %zmm28,%zmm2,%zmm0 + jmp .L_16_blocks_ok_708 + +.L_16_blocks_overflow_708: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %zmm29,%zmm0,%zmm0 +.L_16_blocks_ok_708: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm0,%zmm0 + vpxorq %zmm17,%zmm0,%zmm0 + vextracti32x4 $2,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1){%k1} + vmovdqu8 
%zmm17,%zmm17{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vextracti32x4 $2,%zmm17,%xmm7 + subq $16 * (3 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_709 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_709 +.L_small_initial_partial_block_709: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_709: + + orq %r8,%r8 + je .L_after_reduction_709 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_709: + jmp .L_last_blocks_done_703 +.L_last_num_blocks_is_4_703: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $252,%r15d + jae .L_16_blocks_overflow_710 + vpaddd %zmm28,%zmm2,%zmm0 + jmp .L_16_blocks_ok_710 + +.L_16_blocks_overflow_710: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %zmm29,%zmm0,%zmm0 +.L_16_blocks_ok_710: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq 
$0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm0,%zmm0 + vpxorq %zmm17,%zmm0,%zmm0 + vextracti32x4 $3,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm17,%zmm17{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vextracti32x4 $3,%zmm17,%xmm7 + subq $16 * (4 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_711 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_711 +.L_small_initial_partial_block_711: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + 
vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_711: + + orq %r8,%r8 + je .L_after_reduction_711 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_711: + jmp .L_last_blocks_done_703 +.L_last_num_blocks_is_5_703: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $251,%r15d + jae .L_16_blocks_overflow_712 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %xmm27,%xmm0,%xmm3 + jmp .L_16_blocks_ok_712 + +.L_16_blocks_overflow_712: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %xmm29,%xmm3,%xmm3 +.L_16_blocks_ok_712: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%xmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %xmm30,%xmm3,%xmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %xmm19,%xmm3,%xmm3 + vextracti32x4 $0,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 
%zmm0,0(%r10,%r11,1) + vmovdqu8 %xmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm19,%zmm19{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %xmm29,%xmm19,%xmm19 + vextracti32x4 $0,%zmm19,%xmm7 + subq $16 * (5 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_713 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_713 +.L_small_initial_partial_block_713: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_713: + + orq %r8,%r8 + je .L_after_reduction_713 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_713: + jmp .L_last_blocks_done_703 +.L_last_num_blocks_is_6_703: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $250,%r15d + jae .L_16_blocks_overflow_714 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %ymm27,%ymm0,%ymm3 + jmp .L_16_blocks_ok_714 + +.L_16_blocks_overflow_714: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %ymm29,%ymm3,%ymm3 +.L_16_blocks_ok_714: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + 
vpxorq %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%ymm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %ymm30,%ymm3,%ymm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %ymm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm19,%zmm19{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %ymm29,%ymm19,%ymm19 + vextracti32x4 $1,%zmm19,%xmm7 + subq $16 * (6 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_715 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq 
%xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_715 +.L_small_initial_partial_block_715: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_715: + + orq %r8,%r8 + je .L_after_reduction_715 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_715: + jmp .L_last_blocks_done_703 +.L_last_num_blocks_is_7_703: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $249,%r15d + jae .L_16_blocks_overflow_716 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + jmp .L_16_blocks_ok_716 + +.L_16_blocks_overflow_716: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 +.L_16_blocks_ok_716: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + 
vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti32x4 $2,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm19,%zmm19{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vextracti32x4 $2,%zmm19,%xmm7 + subq $16 * (7 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_717 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_717 +.L_small_initial_partial_block_717: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_717: + + orq %r8,%r8 + je .L_after_reduction_717 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_717: + jmp .L_last_blocks_done_703 +.L_last_num_blocks_is_8_703: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $248,%r15d + jae .L_16_blocks_overflow_718 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + jmp .L_16_blocks_ok_718 + +.L_16_blocks_overflow_718: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 +.L_16_blocks_ok_718: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + 
vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti32x4 $3,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm19,%zmm19{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vextracti32x4 $3,%zmm19,%xmm7 + subq $16 * (8 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_719 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_719 +.L_small_initial_partial_block_719: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_719: + + orq %r8,%r8 + je .L_after_reduction_719 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_719: + jmp 
.L_last_blocks_done_703 +.L_last_num_blocks_is_9_703: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $247,%r15d + jae .L_16_blocks_overflow_720 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %xmm27,%xmm3,%xmm4 + jmp .L_16_blocks_ok_720 + +.L_16_blocks_overflow_720: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %xmm29,%xmm4,%xmm4 +.L_16_blocks_ok_720: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%xmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + 
vaesenclast %xmm30,%xmm4,%xmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %xmm20,%xmm4,%xmm4 + vextracti32x4 $0,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %xmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm20,%zmm20{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %xmm29,%xmm20,%xmm20 + vextracti32x4 $0,%zmm20,%xmm7 + subq $16 * (9 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_721 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_721 +.L_small_initial_partial_block_721: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_721: + + orq %r8,%r8 + je .L_after_reduction_721 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_721: + jmp 
.L_last_blocks_done_703 +.L_last_num_blocks_is_10_703: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $246,%r15d + jae .L_16_blocks_overflow_722 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %ymm27,%ymm3,%ymm4 + jmp .L_16_blocks_ok_722 + +.L_16_blocks_overflow_722: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %ymm29,%ymm4,%ymm4 +.L_16_blocks_ok_722: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%ymm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + 
vaesenclast %ymm30,%ymm4,%ymm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %ymm20,%ymm4,%ymm4 + vextracti32x4 $1,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %ymm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm20,%zmm20{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %ymm29,%ymm20,%ymm20 + vextracti32x4 $1,%zmm20,%xmm7 + subq $16 * (10 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_723 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_723 +.L_small_initial_partial_block_723: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq 
$0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_723: + + orq %r8,%r8 + je .L_after_reduction_723 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_723: + jmp .L_last_blocks_done_703 +.L_last_num_blocks_is_11_703: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $245,%r15d + jae .L_16_blocks_overflow_724 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + jmp .L_16_blocks_ok_724 + +.L_16_blocks_overflow_724: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 +.L_16_blocks_ok_724: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc 
%zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vextracti32x4 $2,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm20,%zmm20{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vextracti32x4 $2,%zmm20,%xmm7 + subq $16 * (11 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_725 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_725 +.L_small_initial_partial_block_725: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq 
%ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_725: + + orq %r8,%r8 + je .L_after_reduction_725 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_725: + jmp .L_last_blocks_done_703 +.L_last_num_blocks_is_12_703: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $244,%r15d + jae .L_16_blocks_overflow_726 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + jmp .L_16_blocks_ok_726 + +.L_16_blocks_overflow_726: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 +.L_16_blocks_ok_726: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc 
%zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vextracti32x4 $3,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm20,%zmm20{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vextracti32x4 $3,%zmm20,%xmm7 + subq $16 * (12 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_727 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 160(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_727 +.L_small_initial_partial_block_727: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq 
$8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_727: + + orq %r8,%r8 + je .L_after_reduction_727 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_727: + jmp .L_last_blocks_done_703 +.L_last_num_blocks_is_13_703: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $243,%r15d + jae .L_16_blocks_overflow_728 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %xmm27,%xmm4,%xmm5 + jmp .L_16_blocks_ok_728 + +.L_16_blocks_overflow_728: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %xmm29,%xmm5,%xmm5 +.L_16_blocks_ok_728: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%xmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + 
+ + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %xmm30,%xmm5,%xmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %xmm21,%xmm5,%xmm5 + vextracti32x4 $0,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %xmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm21,%zmm21{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vpshufb %xmm29,%xmm21,%xmm21 + vextracti32x4 $0,%zmm21,%xmm7 + subq $16 * (13 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_729 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 144(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq 
$4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_729 +.L_small_initial_partial_block_729: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 160(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_729: + + orq %r8,%r8 + je .L_after_reduction_729 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_729: + jmp .L_last_blocks_done_703 +.L_last_num_blocks_is_14_703: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $242,%r15d + jae .L_16_blocks_overflow_730 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %ymm27,%ymm4,%ymm5 + jmp .L_16_blocks_ok_730 + +.L_16_blocks_overflow_730: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %ymm29,%ymm5,%ymm5 +.L_16_blocks_ok_730: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + 
vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%ymm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %ymm30,%ymm5,%ymm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %ymm21,%ymm5,%ymm5 + vextracti32x4 $1,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %ymm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm21,%zmm21{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vpshufb %ymm29,%ymm21,%ymm21 + vextracti32x4 $1,%zmm21,%xmm7 + subq $16 * (14 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_731 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 128(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq 
$0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_731 +.L_small_initial_partial_block_731: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 144(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_731: + + orq %r8,%r8 + je .L_after_reduction_731 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_731: + jmp .L_last_blocks_done_703 +.L_last_num_blocks_is_15_703: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $241,%r15d + jae .L_16_blocks_overflow_732 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_732 + +.L_16_blocks_overflow_732: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 
ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_732: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 
+ vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + vextracti32x4 $2,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm21,%zmm21{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vpshufb %zmm29,%zmm21,%zmm21 + vextracti32x4 $2,%zmm21,%xmm7 + subq $16 * (15 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_733 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 112(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_733 +.L_small_initial_partial_block_733: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 128(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + 
vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_733: + + orq %r8,%r8 + je .L_after_reduction_733 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_733: + jmp .L_last_blocks_done_703 +.L_last_num_blocks_is_16_703: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $240,%r15d + jae .L_16_blocks_overflow_734 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_734 + +.L_16_blocks_overflow_734: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_734: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc 
%zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + vextracti32x4 $3,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm21,%zmm21{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vpshufb %zmm29,%zmm21,%zmm21 + vextracti32x4 $3,%zmm21,%xmm7 + subq $16 * (16 - 1),%r8 +.L_small_initial_partial_block_735: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 112(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq 
%xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_735: + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_735: + jmp .L_last_blocks_done_703 +.L_last_num_blocks_is_0_703: + vmovdqa64 768(%rsp),%zmm13 + vpxorq %zmm14,%zmm13,%zmm13 + vmovdqu64 0(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 832(%rsp),%zmm13 + vmovdqu64 64(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + vpxorq %zmm10,%zmm4,%zmm26 + vpxorq %zmm6,%zmm0,%zmm24 + vpxorq %zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + vmovdqa64 896(%rsp),%zmm13 + vmovdqu64 128(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 960(%rsp),%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + + vpternlogq $0x96,%zmm10,%zmm4,%zmm26 + vpternlogq $0x96,%zmm6,%zmm0,%zmm24 + vpternlogq $0x96,%zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + + vpsrldq $8,%zmm26,%zmm0 + vpslldq $8,%zmm26,%zmm3 + vpxorq %zmm0,%zmm24,%zmm24 + vpxorq %zmm3,%zmm25,%zmm25 + vextracti64x4 $1,%zmm24,%ymm0 + vpxorq %ymm0,%ymm24,%ymm24 + vextracti32x4 $1,%ymm24,%xmm0 + vpxorq %xmm0,%xmm24,%xmm24 + vextracti64x4 $1,%zmm25,%ymm3 + vpxorq %ymm3,%ymm25,%ymm25 + vextracti32x4 $1,%ymm25,%xmm3 + vpxorq %xmm3,%xmm25,%xmm25 + vmovdqa64 POLY2(%rip),%xmm4 + + + vpclmulqdq $0x01,%xmm25,%xmm4,%xmm0 + vpslldq $8,%xmm0,%xmm0 + vpxorq %xmm0,%xmm25,%xmm0 + + + vpclmulqdq $0x00,%xmm0,%xmm4,%xmm3 + vpsrldq $4,%xmm3,%xmm3 + vpclmulqdq $0x10,%xmm0,%xmm4,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm24,%xmm3,%xmm14 + +.L_last_blocks_done_703: + vpshufb %xmm29,%xmm2,%xmm2 + jmp .L_ghash_done_659 +.L_encrypt_16_blocks_659: + cmpb $240,%r15b + jae .L_16_blocks_overflow_736 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_736 +.L_16_blocks_overflow_736: + vpshufb %zmm29,%zmm2,%zmm2 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_736: + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp),%zmm1 + + + + + vshufi64x2 $255,%zmm5,%zmm5,%zmm2 + addb $16,%r15b + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + + + + + + + + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + 
vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm6 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + + + + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%zmm21 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm15,%zmm10,%zmm26 + vpxorq %zmm12,%zmm6,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + + + + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + + + + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1) + vpshufb %zmm29,%zmm17,%zmm0 + vpshufb %zmm29,%zmm19,%zmm3 + vpshufb %zmm29,%zmm20,%zmm4 + vpshufb %zmm29,%zmm21,%zmm5 + vmovdqa64 %zmm0,1280(%rsp) + vmovdqa64 %zmm3,1344(%rsp) + vmovdqa64 %zmm4,1408(%rsp) + vmovdqa64 %zmm5,1472(%rsp) + vmovdqa64 1024(%rsp),%zmm13 + vmovdqu64 256(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 1088(%rsp),%zmm13 + vmovdqu64 320(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq 
$0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + vpternlogq $0x96,%zmm10,%zmm4,%zmm26 + vpternlogq $0x96,%zmm6,%zmm0,%zmm24 + vpternlogq $0x96,%zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + vmovdqa64 1152(%rsp),%zmm13 + vmovdqu64 384(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 1216(%rsp),%zmm13 + vmovdqu64 448(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + + vpternlogq $0x96,%zmm10,%zmm4,%zmm26 + vpternlogq $0x96,%zmm6,%zmm0,%zmm24 + vpternlogq $0x96,%zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + subq $256,%r8 + addq $256,%r11 + movl %r8d,%r10d + addl $15,%r10d + shrl $4,%r10d + je .L_last_num_blocks_is_0_737 + + cmpl $8,%r10d + je .L_last_num_blocks_is_8_737 + jb .L_last_num_blocks_is_7_1_737 + + + cmpl $12,%r10d + je .L_last_num_blocks_is_12_737 + jb .L_last_num_blocks_is_11_9_737 + + + cmpl $15,%r10d + je .L_last_num_blocks_is_15_737 + ja .L_last_num_blocks_is_16_737 + cmpl $14,%r10d + je .L_last_num_blocks_is_14_737 + jmp .L_last_num_blocks_is_13_737 + +.L_last_num_blocks_is_11_9_737: + + cmpl $10,%r10d + je .L_last_num_blocks_is_10_737 + ja .L_last_num_blocks_is_11_737 + jmp .L_last_num_blocks_is_9_737 + +.L_last_num_blocks_is_7_1_737: + cmpl $4,%r10d + je .L_last_num_blocks_is_4_737 + jb .L_last_num_blocks_is_3_1_737 + + cmpl $6,%r10d + ja .L_last_num_blocks_is_7_737 + je .L_last_num_blocks_is_6_737 + jmp .L_last_num_blocks_is_5_737 + +.L_last_num_blocks_is_3_1_737: + + cmpl $2,%r10d + ja .L_last_num_blocks_is_3_737 + je .L_last_num_blocks_is_2_737 +.L_last_num_blocks_is_1_737: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $255,%r15d + jae .L_16_blocks_overflow_738 + vpaddd %xmm28,%xmm2,%xmm0 + jmp .L_16_blocks_ok_738 + +.L_16_blocks_overflow_738: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %xmm29,%xmm0,%xmm0 +.L_16_blocks_ok_738: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $0,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%xmm17{%k1}{z} + vaesenc %xmm31,%xmm0,%xmm0 
+ vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %xmm31,%xmm0,%xmm0 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vbroadcastf64x2 176(%rdi),%zmm31 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %xmm31,%xmm0,%xmm0 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %xmm30,%xmm0,%xmm0 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti32x4 $0,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %xmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm17,%zmm17{%k1}{z} + vpshufb %xmm29,%xmm17,%xmm17 + vextracti32x4 $0,%zmm17,%xmm7 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_739 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_739 +.L_small_initial_partial_block_739: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + + + + + + + + + + + + vpxorq %xmm7,%xmm14,%xmm14 + + jmp .L_after_reduction_739 +.L_small_initial_compute_done_739: +.L_after_reduction_739: + jmp .L_last_blocks_done_737 +.L_last_num_blocks_is_2_737: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $254,%r15d + jae .L_16_blocks_overflow_740 + vpaddd %ymm28,%ymm2,%ymm0 + jmp .L_16_blocks_ok_740 + +.L_16_blocks_overflow_740: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %ymm29,%ymm0,%ymm0 +.L_16_blocks_ok_740: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $1,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq 
%ymm30,%ymm0,%ymm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%ymm17{%k1}{z} + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %ymm31,%ymm0,%ymm0 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vbroadcastf64x2 176(%rdi),%zmm31 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %ymm31,%ymm0,%ymm0 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %ymm30,%ymm0,%ymm0 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %ymm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm17,%zmm17{%k1}{z} + vpshufb %ymm29,%ymm17,%ymm17 + vextracti32x4 $1,%zmm17,%xmm7 + subq $16 * (2 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_741 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + 
vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_741 +.L_small_initial_partial_block_741: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_741: + + orq %r8,%r8 + je .L_after_reduction_741 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_741: + jmp .L_last_blocks_done_737 +.L_last_num_blocks_is_3_737: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $253,%r15d + jae .L_16_blocks_overflow_742 + vpaddd %zmm28,%zmm2,%zmm0 + jmp .L_16_blocks_ok_742 + +.L_16_blocks_overflow_742: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %zmm29,%zmm0,%zmm0 +.L_16_blocks_ok_742: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $2,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq 
$0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vbroadcastf64x2 176(%rdi),%zmm31 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vpxorq %zmm17,%zmm0,%zmm0 + vextracti32x4 $2,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm17,%zmm17{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vextracti32x4 $2,%zmm17,%xmm7 + subq $16 * (3 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_743 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_743 +.L_small_initial_partial_block_743: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_743: + + orq %r8,%r8 + je .L_after_reduction_743 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_743: + jmp .L_last_blocks_done_737 +.L_last_num_blocks_is_4_737: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq 
(%r10,%rax,8),%k1 + cmpl $252,%r15d + jae .L_16_blocks_overflow_744 + vpaddd %zmm28,%zmm2,%zmm0 + jmp .L_16_blocks_ok_744 + +.L_16_blocks_overflow_744: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %zmm29,%zmm0,%zmm0 +.L_16_blocks_ok_744: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $3,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vbroadcastf64x2 176(%rdi),%zmm31 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vpxorq %zmm17,%zmm0,%zmm0 + vextracti32x4 $3,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm17,%zmm17{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vextracti32x4 $3,%zmm17,%xmm7 + subq $16 * (4 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_745 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq 
$0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_745 +.L_small_initial_partial_block_745: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_745: + + orq %r8,%r8 + je .L_after_reduction_745 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_745: + jmp .L_last_blocks_done_737 +.L_last_num_blocks_is_5_737: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $251,%r15d + jae .L_16_blocks_overflow_746 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %xmm27,%xmm0,%xmm3 + jmp .L_16_blocks_ok_746 + +.L_16_blocks_overflow_746: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %xmm29,%xmm3,%xmm3 +.L_16_blocks_ok_746: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $0,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%xmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vbroadcastf64x2 176(%rdi),%zmm31 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %xmm30,%xmm3,%xmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %xmm19,%xmm3,%xmm3 + vextracti32x4 $0,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %xmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm19,%zmm19{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %xmm29,%xmm19,%xmm19 + vextracti32x4 $0,%zmm19,%xmm7 + subq $16 * (5 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_747 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq 
%xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_747 +.L_small_initial_partial_block_747: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_747: + + orq %r8,%r8 + je .L_after_reduction_747 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_747: + jmp .L_last_blocks_done_737 +.L_last_num_blocks_is_6_737: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $250,%r15d + jae .L_16_blocks_overflow_748 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %ymm27,%ymm0,%ymm3 + jmp .L_16_blocks_ok_748 + +.L_16_blocks_overflow_748: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %ymm29,%ymm3,%ymm3 +.L_16_blocks_ok_748: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $1,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%ymm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq 
$0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vbroadcastf64x2 176(%rdi),%zmm31 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %ymm30,%ymm3,%ymm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %ymm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm19,%zmm19{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %ymm29,%ymm19,%ymm19 + vextracti32x4 $1,%zmm19,%xmm7 + subq $16 * (6 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_749 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_749 +.L_small_initial_partial_block_749: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq 
$0x01,%xmm1,%xmm19,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_749: + + orq %r8,%r8 + je .L_after_reduction_749 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_749: + jmp .L_last_blocks_done_737 +.L_last_num_blocks_is_7_737: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $249,%r15d + jae .L_16_blocks_overflow_750 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + jmp .L_16_blocks_ok_750 + +.L_16_blocks_overflow_750: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 +.L_16_blocks_ok_750: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $2,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq 
$0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vbroadcastf64x2 176(%rdi),%zmm31 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti32x4 $2,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm19,%zmm19{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vextracti32x4 $2,%zmm19,%xmm7 + subq $16 * (7 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_751 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_751 +.L_small_initial_partial_block_751: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + 
+ vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_751: + + orq %r8,%r8 + je .L_after_reduction_751 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_751: + jmp .L_last_blocks_done_737 +.L_last_num_blocks_is_8_737: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $248,%r15d + jae .L_16_blocks_overflow_752 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + jmp .L_16_blocks_ok_752 + +.L_16_blocks_overflow_752: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 +.L_16_blocks_ok_752: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $3,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + 
vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vbroadcastf64x2 176(%rdi),%zmm31 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti32x4 $3,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm19,%zmm19{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vextracti32x4 $3,%zmm19,%xmm7 + subq $16 * (8 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_753 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_753 +.L_small_initial_partial_block_753: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq 
%xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_753: + + orq %r8,%r8 + je .L_after_reduction_753 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_753: + jmp .L_last_blocks_done_737 +.L_last_num_blocks_is_9_737: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $247,%r15d + jae .L_16_blocks_overflow_754 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %xmm27,%xmm3,%xmm4 + jmp .L_16_blocks_ok_754 + +.L_16_blocks_overflow_754: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %xmm29,%xmm4,%xmm4 +.L_16_blocks_ok_754: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $0,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%xmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc 
%xmm31,%xmm4,%xmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vbroadcastf64x2 176(%rdi),%zmm31 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %xmm30,%xmm4,%xmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %xmm20,%xmm4,%xmm4 + vextracti32x4 $0,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %xmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm20,%zmm20{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %xmm29,%xmm20,%xmm20 + vextracti32x4 $0,%zmm20,%xmm7 + subq $16 * (9 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_755 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_755 +.L_small_initial_partial_block_755: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq 
$0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_755: + + orq %r8,%r8 + je .L_after_reduction_755 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_755: + jmp .L_last_blocks_done_737 +.L_last_num_blocks_is_10_737: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $246,%r15d + jae .L_16_blocks_overflow_756 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %ymm27,%ymm3,%ymm4 + jmp .L_16_blocks_ok_756 + +.L_16_blocks_overflow_756: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %ymm29,%ymm4,%ymm4 +.L_16_blocks_ok_756: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $1,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%ymm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc 
%zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vbroadcastf64x2 176(%rdi),%zmm31 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %ymm30,%ymm4,%ymm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %ymm20,%ymm4,%ymm4 + vextracti32x4 $1,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %ymm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm20,%zmm20{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %ymm29,%ymm20,%ymm20 + vextracti32x4 $1,%zmm20,%xmm7 + subq $16 * (10 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_757 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq 
%xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_757 +.L_small_initial_partial_block_757: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_757: + + orq %r8,%r8 + je .L_after_reduction_757 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_757: + jmp .L_last_blocks_done_737 +.L_last_num_blocks_is_11_737: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $245,%r15d + jae .L_16_blocks_overflow_758 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + jmp .L_16_blocks_ok_758 + +.L_16_blocks_overflow_758: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 +.L_16_blocks_ok_758: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $2,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 
+ vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vbroadcastf64x2 176(%rdi),%zmm31 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vextracti32x4 $2,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm20,%zmm20{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vextracti32x4 $2,%zmm20,%xmm7 + subq $16 * (11 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_759 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + 
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_759 +.L_small_initial_partial_block_759: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_759: + + orq %r8,%r8 + je .L_after_reduction_759 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_759: + jmp .L_last_blocks_done_737 +.L_last_num_blocks_is_12_737: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $244,%r15d + jae .L_16_blocks_overflow_760 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + jmp .L_16_blocks_ok_760 + +.L_16_blocks_overflow_760: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + 
vpshufb %zmm29,%zmm4,%zmm4 +.L_16_blocks_ok_760: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $3,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vbroadcastf64x2 176(%rdi),%zmm31 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq 
$0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vextracti32x4 $3,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm20,%zmm20{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vextracti32x4 $3,%zmm20,%xmm7 + subq $16 * (12 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_761 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 160(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_761 +.L_small_initial_partial_block_761: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq 
$0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_761: + + orq %r8,%r8 + je .L_after_reduction_761 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_761: + jmp .L_last_blocks_done_737 +.L_last_num_blocks_is_13_737: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $243,%r15d + jae .L_16_blocks_overflow_762 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %xmm27,%xmm4,%xmm5 + jmp .L_16_blocks_ok_762 + +.L_16_blocks_overflow_762: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %xmm29,%xmm5,%xmm5 +.L_16_blocks_ok_762: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $0,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%xmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + 
vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vbroadcastf64x2 176(%rdi),%zmm31 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %xmm30,%xmm5,%xmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %xmm21,%xmm5,%xmm5 + vextracti32x4 $0,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %xmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm21,%zmm21{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vpshufb %xmm29,%xmm21,%xmm21 + vextracti32x4 $0,%zmm21,%xmm7 + subq $16 * (13 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_763 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 144(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq 
$0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_763 +.L_small_initial_partial_block_763: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 160(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_763: + + orq %r8,%r8 + je .L_after_reduction_763 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_763: + jmp .L_last_blocks_done_737 +.L_last_num_blocks_is_14_737: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $242,%r15d + jae .L_16_blocks_overflow_764 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %ymm27,%ymm4,%ymm5 + jmp .L_16_blocks_ok_764 + +.L_16_blocks_overflow_764: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %ymm29,%ymm5,%ymm5 +.L_16_blocks_ok_764: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $1,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq 
$0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%ymm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vbroadcastf64x2 176(%rdi),%zmm31 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %ymm30,%ymm5,%ymm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %ymm21,%ymm5,%ymm5 + vextracti32x4 $1,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %ymm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm21,%zmm21{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb 
%zmm29,%zmm20,%zmm20 + vpshufb %ymm29,%ymm21,%ymm21 + vextracti32x4 $1,%zmm21,%xmm7 + subq $16 * (14 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_765 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 128(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_765 +.L_small_initial_partial_block_765: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 144(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq 
%xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_765: + + orq %r8,%r8 + je .L_after_reduction_765 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_765: + jmp .L_last_blocks_done_737 +.L_last_num_blocks_is_15_737: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $241,%r15d + jae .L_16_blocks_overflow_766 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_766 + +.L_16_blocks_overflow_766: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_766: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $2,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq 
$0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vbroadcastf64x2 176(%rdi),%zmm31 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + vextracti32x4 $2,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm21,%zmm21{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vpshufb %zmm29,%zmm21,%zmm21 + vextracti32x4 $2,%zmm21,%xmm7 + subq $16 * (15 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_767 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 112(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 
$1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_767 +.L_small_initial_partial_block_767: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 128(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_767: + + orq %r8,%r8 + je .L_after_reduction_767 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_767: + jmp .L_last_blocks_done_737 +.L_last_num_blocks_is_16_737: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $240,%r15d + jae .L_16_blocks_overflow_768 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_768 + +.L_16_blocks_overflow_768: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_768: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $3,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq 
$0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vbroadcastf64x2 176(%rdi),%zmm31 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq 
%zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + vextracti32x4 $3,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm21,%zmm21{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vpshufb %zmm29,%zmm21,%zmm21 + vextracti32x4 $3,%zmm21,%xmm7 + subq $16 * (16 - 1),%r8 +.L_small_initial_partial_block_769: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 112(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_769: + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_769: + jmp .L_last_blocks_done_737 +.L_last_num_blocks_is_0_737: + vmovdqa64 1280(%rsp),%zmm13 + vmovdqu64 512(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 1344(%rsp),%zmm13 + vmovdqu64 576(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + vpternlogq $0x96,%zmm10,%zmm4,%zmm26 + vpternlogq $0x96,%zmm6,%zmm0,%zmm24 + vpternlogq $0x96,%zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + vmovdqa64 1408(%rsp),%zmm13 + vmovdqu64 640(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 1472(%rsp),%zmm13 + vmovdqu64 704(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + + vpternlogq $0x96,%zmm10,%zmm4,%zmm26 + vpternlogq 
$0x96,%zmm6,%zmm0,%zmm24 + vpternlogq $0x96,%zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + + vpsrldq $8,%zmm26,%zmm0 + vpslldq $8,%zmm26,%zmm3 + vpxorq %zmm0,%zmm24,%zmm24 + vpxorq %zmm3,%zmm25,%zmm25 + vextracti64x4 $1,%zmm24,%ymm0 + vpxorq %ymm0,%ymm24,%ymm24 + vextracti32x4 $1,%ymm24,%xmm0 + vpxorq %xmm0,%xmm24,%xmm24 + vextracti64x4 $1,%zmm25,%ymm3 + vpxorq %ymm3,%ymm25,%ymm25 + vextracti32x4 $1,%ymm25,%xmm3 + vpxorq %xmm3,%xmm25,%xmm25 + vmovdqa64 POLY2(%rip),%xmm4 + + + vpclmulqdq $0x01,%xmm25,%xmm4,%xmm0 + vpslldq $8,%xmm0,%xmm0 + vpxorq %xmm0,%xmm25,%xmm0 + + + vpclmulqdq $0x00,%xmm0,%xmm4,%xmm3 + vpsrldq $4,%xmm3,%xmm3 + vpclmulqdq $0x10,%xmm0,%xmm4,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm24,%xmm3,%xmm14 + +.L_last_blocks_done_737: + vpshufb %xmm29,%xmm2,%xmm2 + jmp .L_ghash_done_659 + +.L_message_below_32_blocks_659: + + + subq $256,%r8 + addq $256,%r11 + movl %r8d,%r10d + testq %r14,%r14 + jnz .L_skip_hkeys_precomputation_770 + vmovdqu64 640(%rsp),%zmm3 + + + vshufi64x2 $0x00,%zmm3,%zmm3,%zmm3 + + vmovdqu64 576(%rsp),%zmm4 + vmovdqu64 512(%rsp),%zmm5 + + vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4 + vpxorq %zmm10,%zmm4,%zmm4 + + vpsrldq $8,%zmm4,%zmm10 + vpslldq $8,%zmm4,%zmm4 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4 + vpslldq $4,%zmm4,%zmm4 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm4 + + vmovdqu64 %zmm4,448(%rsp) + + vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5 + vpxorq %zmm10,%zmm5,%zmm5 + + vpsrldq $8,%zmm5,%zmm10 + vpslldq $8,%zmm5,%zmm5 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5 + vpslldq $4,%zmm5,%zmm5 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm5 + + vmovdqu64 %zmm5,384(%rsp) + + vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4 + vpxorq %zmm10,%zmm4,%zmm4 + + vpsrldq $8,%zmm4,%zmm10 + vpslldq $8,%zmm4,%zmm4 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4 + vpslldq $4,%zmm4,%zmm4 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm4 + + vmovdqu64 %zmm4,320(%rsp) + + vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5 + vpxorq %zmm10,%zmm5,%zmm5 + + vpsrldq $8,%zmm5,%zmm10 + vpslldq $8,%zmm5,%zmm5 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5 + vpslldq $4,%zmm5,%zmm5 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm5 + + 
vmovdqu64 %zmm5,256(%rsp) +.L_skip_hkeys_precomputation_770: + movq $1,%r14 + andl $~15,%r10d + movl $512,%ebx + subl %r10d,%ebx + movl %r8d,%r10d + addl $15,%r10d + shrl $4,%r10d + je .L_last_num_blocks_is_0_771 + + cmpl $8,%r10d + je .L_last_num_blocks_is_8_771 + jb .L_last_num_blocks_is_7_1_771 + + + cmpl $12,%r10d + je .L_last_num_blocks_is_12_771 + jb .L_last_num_blocks_is_11_9_771 + + + cmpl $15,%r10d + je .L_last_num_blocks_is_15_771 + ja .L_last_num_blocks_is_16_771 + cmpl $14,%r10d + je .L_last_num_blocks_is_14_771 + jmp .L_last_num_blocks_is_13_771 + +.L_last_num_blocks_is_11_9_771: + + cmpl $10,%r10d + je .L_last_num_blocks_is_10_771 + ja .L_last_num_blocks_is_11_771 + jmp .L_last_num_blocks_is_9_771 + +.L_last_num_blocks_is_7_1_771: + cmpl $4,%r10d + je .L_last_num_blocks_is_4_771 + jb .L_last_num_blocks_is_3_1_771 + + cmpl $6,%r10d + ja .L_last_num_blocks_is_7_771 + je .L_last_num_blocks_is_6_771 + jmp .L_last_num_blocks_is_5_771 + +.L_last_num_blocks_is_3_1_771: + + cmpl $2,%r10d + ja .L_last_num_blocks_is_3_771 + je .L_last_num_blocks_is_2_771 +.L_last_num_blocks_is_1_771: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $255,%r15d + jae .L_16_blocks_overflow_772 + vpaddd %xmm28,%xmm2,%xmm0 + jmp .L_16_blocks_ok_772 + +.L_16_blocks_overflow_772: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %xmm29,%xmm0,%xmm0 +.L_16_blocks_ok_772: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%xmm17{%k1}{z} + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %xmm31,%xmm0,%xmm0 + 
vaesenclast %xmm30,%xmm0,%xmm0 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti32x4 $0,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %xmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm17,%zmm17{%k1}{z} + vpshufb %xmm29,%xmm17,%xmm17 + vextracti32x4 $0,%zmm17,%xmm7 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_773 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_773 +.L_small_initial_partial_block_773: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + + + vpsrldq $8,%zmm26,%zmm0 + vpslldq $8,%zmm26,%zmm3 + vpxorq %zmm0,%zmm24,%zmm24 + vpxorq %zmm3,%zmm25,%zmm25 + vextracti64x4 $1,%zmm24,%ymm0 + vpxorq %ymm0,%ymm24,%ymm24 + vextracti32x4 $1,%ymm24,%xmm0 + vpxorq %xmm0,%xmm24,%xmm24 + vextracti64x4 $1,%zmm25,%ymm3 + vpxorq %ymm3,%ymm25,%ymm25 + vextracti32x4 $1,%ymm25,%xmm3 + vpxorq %xmm3,%xmm25,%xmm25 + vmovdqa64 POLY2(%rip),%xmm0 + + + vpclmulqdq $0x01,%xmm25,%xmm0,%xmm3 + vpslldq $8,%xmm3,%xmm3 + vpxorq %xmm3,%xmm25,%xmm3 + + + vpclmulqdq $0x00,%xmm3,%xmm0,%xmm4 + vpsrldq $4,%xmm4,%xmm4 + vpclmulqdq $0x10,%xmm3,%xmm0,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm24,%xmm4,%xmm14 + + + + + + + + + + + + + vpxorq %xmm7,%xmm14,%xmm14 + + jmp .L_after_reduction_773 +.L_small_initial_compute_done_773: +.L_after_reduction_773: + jmp .L_last_blocks_done_771 +.L_last_num_blocks_is_2_771: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $254,%r15d + jae .L_16_blocks_overflow_774 + vpaddd %ymm28,%ymm2,%ymm0 + jmp .L_16_blocks_ok_774 + +.L_16_blocks_overflow_774: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %ymm29,%ymm0,%ymm0 +.L_16_blocks_ok_774: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq 
$0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%ymm17{%k1}{z} + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %ymm31,%ymm0,%ymm0 + vaesenclast %ymm30,%ymm0,%ymm0 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %ymm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm17,%zmm17{%k1}{z} + vpshufb %ymm29,%ymm17,%ymm17 + vextracti32x4 $1,%zmm17,%xmm7 + subq $16 * (2 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_775 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_775 +.L_small_initial_partial_block_775: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + 
+.L_small_initial_compute_done_775: + + orq %r8,%r8 + je .L_after_reduction_775 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_775: + jmp .L_last_blocks_done_771 +.L_last_num_blocks_is_3_771: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $253,%r15d + jae .L_16_blocks_overflow_776 + vpaddd %zmm28,%zmm2,%zmm0 + jmp .L_16_blocks_ok_776 + +.L_16_blocks_overflow_776: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %zmm29,%zmm0,%zmm0 +.L_16_blocks_ok_776: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm0,%zmm0 + vpxorq %zmm17,%zmm0,%zmm0 + vextracti32x4 $2,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm17,%zmm17{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vextracti32x4 $2,%zmm17,%xmm7 + subq $16 * (3 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_777 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq 
%xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_777 +.L_small_initial_partial_block_777: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_777: + + orq %r8,%r8 + je .L_after_reduction_777 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_777: + jmp .L_last_blocks_done_771 +.L_last_num_blocks_is_4_771: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $252,%r15d + jae .L_16_blocks_overflow_778 + vpaddd %zmm28,%zmm2,%zmm0 + jmp .L_16_blocks_ok_778 + +.L_16_blocks_overflow_778: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %zmm29,%zmm0,%zmm0 +.L_16_blocks_ok_778: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq 
$0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm0,%zmm0 + vpxorq %zmm17,%zmm0,%zmm0 + vextracti32x4 $3,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm17,%zmm17{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vextracti32x4 $3,%zmm17,%xmm7 + subq $16 * (4 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_779 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_779 +.L_small_initial_partial_block_779: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_779: + + orq %r8,%r8 + je .L_after_reduction_779 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_779: + jmp .L_last_blocks_done_771 +.L_last_num_blocks_is_5_771: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $251,%r15d + jae .L_16_blocks_overflow_780 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %xmm27,%xmm0,%xmm3 + jmp .L_16_blocks_ok_780 + +.L_16_blocks_overflow_780: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 
ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %xmm29,%xmm3,%xmm3 +.L_16_blocks_ok_780: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%xmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %xmm30,%xmm3,%xmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %xmm19,%xmm3,%xmm3 + vextracti32x4 $0,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %xmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm19,%zmm19{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %xmm29,%xmm19,%xmm19 + vextracti32x4 $0,%zmm19,%xmm7 + subq $16 * (5 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_781 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq 
$0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_781 +.L_small_initial_partial_block_781: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_781: + + orq %r8,%r8 + je .L_after_reduction_781 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_781: + jmp .L_last_blocks_done_771 +.L_last_num_blocks_is_6_771: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $250,%r15d + jae .L_16_blocks_overflow_782 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %ymm27,%ymm0,%ymm3 + jmp .L_16_blocks_ok_782 + +.L_16_blocks_overflow_782: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %ymm29,%ymm3,%ymm3 +.L_16_blocks_ok_782: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq 
$0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%ymm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %ymm30,%ymm3,%ymm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %ymm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm19,%zmm19{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %ymm29,%ymm19,%ymm19 + vextracti32x4 $1,%zmm19,%xmm7 + subq $16 * (6 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_783 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_783 +.L_small_initial_partial_block_783: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5 + vpclmulqdq 
$0x11,%xmm1,%xmm19,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_783: + + orq %r8,%r8 + je .L_after_reduction_783 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_783: + jmp .L_last_blocks_done_771 +.L_last_num_blocks_is_7_771: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $249,%r15d + jae .L_16_blocks_overflow_784 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + jmp .L_16_blocks_ok_784 + +.L_16_blocks_overflow_784: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 +.L_16_blocks_ok_784: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq 
%zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti32x4 $2,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm19,%zmm19{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vextracti32x4 $2,%zmm19,%xmm7 + subq $16 * (7 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_785 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_785 +.L_small_initial_partial_block_785: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_785: + + 
orq %r8,%r8 + je .L_after_reduction_785 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_785: + jmp .L_last_blocks_done_771 +.L_last_num_blocks_is_8_771: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $248,%r15d + jae .L_16_blocks_overflow_786 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + jmp .L_16_blocks_ok_786 + +.L_16_blocks_overflow_786: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 +.L_16_blocks_ok_786: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti32x4 $3,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm19,%zmm19{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vextracti32x4 $3,%zmm19,%xmm7 + subq $16 * (8 - 1),%r8 + + + cmpq $16,%r8 + jl 
.L_small_initial_partial_block_787 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_787 +.L_small_initial_partial_block_787: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_787: + + orq %r8,%r8 + je .L_after_reduction_787 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_787: + jmp .L_last_blocks_done_771 +.L_last_num_blocks_is_9_771: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $247,%r15d + jae .L_16_blocks_overflow_788 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %xmm27,%xmm3,%xmm4 + jmp .L_16_blocks_ok_788 + +.L_16_blocks_overflow_788: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %xmm29,%xmm4,%xmm4 +.L_16_blocks_ok_788: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 
$0,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%xmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %xmm30,%xmm4,%xmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %xmm20,%xmm4,%xmm4 + vextracti32x4 $0,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %xmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm20,%zmm20{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %xmm29,%xmm20,%xmm20 + vextracti32x4 $0,%zmm20,%xmm7 + subq $16 * (9 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_789 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 
272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_789 +.L_small_initial_partial_block_789: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_789: + + orq %r8,%r8 + je .L_after_reduction_789 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_789: + jmp .L_last_blocks_done_771 +.L_last_num_blocks_is_10_771: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $246,%r15d + jae .L_16_blocks_overflow_790 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %ymm27,%ymm3,%ymm4 + jmp .L_16_blocks_ok_790 + +.L_16_blocks_overflow_790: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %ymm29,%ymm4,%ymm4 +.L_16_blocks_ok_790: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm4,%xmm2 
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%ymm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %ymm30,%ymm4,%ymm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %ymm20,%ymm4,%ymm4 + vextracti32x4 $1,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %ymm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm20,%zmm20{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %ymm29,%ymm20,%ymm20 + vextracti32x4 $1,%zmm20,%xmm7 + subq $16 * (10 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_791 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 256(%rsi),%zmm1 
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_791 +.L_small_initial_partial_block_791: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_791: + + orq %r8,%r8 + je .L_after_reduction_791 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_791: + jmp .L_last_blocks_done_771 +.L_last_num_blocks_is_11_771: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $245,%r15d + jae .L_16_blocks_overflow_792 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + jmp .L_16_blocks_ok_792 + +.L_16_blocks_overflow_792: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 
+ vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 +.L_16_blocks_ok_792: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vextracti32x4 $2,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm20,%zmm20{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vextracti32x4 $2,%zmm20,%xmm7 + subq $16 * (11 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_793 + + + + + + subq $16,%r8 + movq 
$0,(%rdx) + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_793 +.L_small_initial_partial_block_793: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_793: + + orq %r8,%r8 + je .L_after_reduction_793 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_793: + jmp .L_last_blocks_done_771 +.L_last_num_blocks_is_12_771: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $244,%r15d + jae .L_16_blocks_overflow_794 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd 
%zmm27,%zmm3,%zmm4 + jmp .L_16_blocks_ok_794 + +.L_16_blocks_overflow_794: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 +.L_16_blocks_ok_794: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vextracti32x4 $3,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1){%k1} + vmovdqu8 
%zmm20,%zmm20{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vextracti32x4 $3,%zmm20,%xmm7 + subq $16 * (12 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_795 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 160(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_795 +.L_small_initial_partial_block_795: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_795: + + orq %r8,%r8 + je .L_after_reduction_795 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_795: + jmp 
.L_last_blocks_done_771 +.L_last_num_blocks_is_13_771: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $243,%r15d + jae .L_16_blocks_overflow_796 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %xmm27,%xmm4,%xmm5 + jmp .L_16_blocks_ok_796 + +.L_16_blocks_overflow_796: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %xmm29,%xmm5,%xmm5 +.L_16_blocks_ok_796: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%xmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc 
%zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %xmm30,%xmm5,%xmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %xmm21,%xmm5,%xmm5 + vextracti32x4 $0,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %xmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm21,%zmm21{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vpshufb %xmm29,%xmm21,%xmm21 + vextracti32x4 $0,%zmm21,%xmm7 + subq $16 * (13 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_797 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 144(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_797 +.L_small_initial_partial_block_797: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 160(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq 
$0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_797: + + orq %r8,%r8 + je .L_after_reduction_797 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_797: + jmp .L_last_blocks_done_771 +.L_last_num_blocks_is_14_771: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $242,%r15d + jae .L_16_blocks_overflow_798 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %ymm27,%ymm4,%ymm5 + jmp .L_16_blocks_ok_798 + +.L_16_blocks_overflow_798: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %ymm29,%ymm5,%ymm5 +.L_16_blocks_ok_798: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 
0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%ymm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %ymm30,%ymm5,%ymm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %ymm21,%ymm5,%ymm5 + vextracti32x4 $1,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %ymm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm21,%zmm21{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vpshufb %ymm29,%ymm21,%ymm21 + vextracti32x4 $1,%zmm21,%xmm7 + subq $16 * (14 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_799 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 128(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + 
vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_799 +.L_small_initial_partial_block_799: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 144(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_799: + + orq %r8,%r8 + je .L_after_reduction_799 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_799: + jmp .L_last_blocks_done_771 +.L_last_num_blocks_is_15_771: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $241,%r15d + jae .L_16_blocks_overflow_800 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_800 + +.L_16_blocks_overflow_800: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_800: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq 
$0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + vextracti32x4 $2,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm21,%zmm21{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vpshufb %zmm29,%zmm21,%zmm21 + vextracti32x4 $2,%zmm21,%xmm7 + subq $16 * (15 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_801 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 112(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + 
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_801 +.L_small_initial_partial_block_801: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 128(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_801: + + orq 
%r8,%r8 + je .L_after_reduction_801 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_801: + jmp .L_last_blocks_done_771 +.L_last_num_blocks_is_16_771: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $240,%r15d + jae .L_16_blocks_overflow_802 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_802 + +.L_16_blocks_overflow_802: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_802: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc 
%zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + vextracti32x4 $3,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm21,%zmm21{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vpshufb %zmm29,%zmm21,%zmm21 + vextracti32x4 $3,%zmm21,%xmm7 + subq $16 * (16 - 1),%r8 +.L_small_initial_partial_block_803: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 112(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_803: + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_803: + jmp .L_last_blocks_done_771 +.L_last_num_blocks_is_0_771: + vmovdqa64 768(%rsp),%zmm13 + vpxorq %zmm14,%zmm13,%zmm13 + vmovdqu64 0(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 832(%rsp),%zmm13 + vmovdqu64 64(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq 
$0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + vpxorq %zmm10,%zmm4,%zmm26 + vpxorq %zmm6,%zmm0,%zmm24 + vpxorq %zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + vmovdqa64 896(%rsp),%zmm13 + vmovdqu64 128(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 960(%rsp),%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + + vpternlogq $0x96,%zmm10,%zmm4,%zmm26 + vpternlogq $0x96,%zmm6,%zmm0,%zmm24 + vpternlogq $0x96,%zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + + vpsrldq $8,%zmm26,%zmm0 + vpslldq $8,%zmm26,%zmm3 + vpxorq %zmm0,%zmm24,%zmm24 + vpxorq %zmm3,%zmm25,%zmm25 + vextracti64x4 $1,%zmm24,%ymm0 + vpxorq %ymm0,%ymm24,%ymm24 + vextracti32x4 $1,%ymm24,%xmm0 + vpxorq %xmm0,%xmm24,%xmm24 + vextracti64x4 $1,%zmm25,%ymm3 + vpxorq %ymm3,%ymm25,%ymm25 + vextracti32x4 $1,%ymm25,%xmm3 + vpxorq %xmm3,%xmm25,%xmm25 + vmovdqa64 POLY2(%rip),%xmm4 + + + vpclmulqdq $0x01,%xmm25,%xmm4,%xmm0 + vpslldq $8,%xmm0,%xmm0 + vpxorq %xmm0,%xmm25,%xmm0 + + + vpclmulqdq $0x00,%xmm0,%xmm4,%xmm3 + vpsrldq $4,%xmm3,%xmm3 + vpclmulqdq $0x10,%xmm0,%xmm4,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm24,%xmm3,%xmm14 + +.L_last_blocks_done_771: + vpshufb %xmm29,%xmm2,%xmm2 + jmp .L_ghash_done_659 + +.L_message_below_equal_16_blocks_659: + + + movl %r8d,%r12d + addl $15,%r12d + shrl $4,%r12d + cmpq $8,%r12 + je .L_small_initial_num_blocks_is_8_804 + jl .L_small_initial_num_blocks_is_7_1_804 + + + cmpq $12,%r12 + je .L_small_initial_num_blocks_is_12_804 + jl .L_small_initial_num_blocks_is_11_9_804 + + + cmpq $16,%r12 + je .L_small_initial_num_blocks_is_16_804 + cmpq $15,%r12 + je .L_small_initial_num_blocks_is_15_804 + cmpq $14,%r12 + je .L_small_initial_num_blocks_is_14_804 + jmp .L_small_initial_num_blocks_is_13_804 + +.L_small_initial_num_blocks_is_11_9_804: + + cmpq $11,%r12 + je .L_small_initial_num_blocks_is_11_804 + cmpq $10,%r12 + je .L_small_initial_num_blocks_is_10_804 + jmp .L_small_initial_num_blocks_is_9_804 + +.L_small_initial_num_blocks_is_7_1_804: + cmpq $4,%r12 + je .L_small_initial_num_blocks_is_4_804 + jl .L_small_initial_num_blocks_is_3_1_804 + + cmpq $7,%r12 + je .L_small_initial_num_blocks_is_7_804 + cmpq $6,%r12 + je .L_small_initial_num_blocks_is_6_804 + jmp .L_small_initial_num_blocks_is_5_804 + +.L_small_initial_num_blocks_is_3_1_804: + + cmpq $3,%r12 + je .L_small_initial_num_blocks_is_3_804 + cmpq $2,%r12 + je .L_small_initial_num_blocks_is_2_804 + + + + + +.L_small_initial_num_blocks_is_1_804: + vmovdqa64 SHUF_MASK(%rip),%xmm29 + vpaddd ONE(%rip),%xmm2,%xmm0 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $0,%zmm0,%xmm2 + vpshufb %xmm29,%xmm0,%xmm0 + vmovdqu8 0(%rcx,%r11,1),%xmm6{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 
128(%rdi),%zmm15 + vaesenc %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenc %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 176(%rdi),%zmm15 + vaesenc %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 192(%rdi),%zmm15 + vaesenclast %xmm15,%xmm0,%xmm0 + vpxorq %xmm6,%xmm0,%xmm0 + vextracti32x4 $0,%zmm0,%xmm12 + movq %r9,%r10 + vmovdqu8 %xmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %xmm29,%xmm6,%xmm6 + vextracti32x4 $0,%zmm6,%xmm13 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_805 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 336(%rsi),%xmm20 + vpclmulqdq $0x01,%xmm20,%xmm6,%xmm4 + vpclmulqdq $0x10,%xmm20,%xmm6,%xmm5 + vpclmulqdq $0x11,%xmm20,%xmm6,%xmm0 + vpclmulqdq $0x00,%xmm20,%xmm6,%xmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_805 +.L_small_initial_partial_block_805: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + + + + + + + + + + + + vpxorq %xmm13,%xmm14,%xmm14 + + jmp .L_after_reduction_805 +.L_small_initial_compute_done_805: +.L_after_reduction_805: + jmp .L_small_initial_blocks_encrypted_804 +.L_small_initial_num_blocks_is_2_804: + vmovdqa64 SHUF_MASK(%rip),%ymm29 + vshufi64x2 $0,%ymm2,%ymm2,%ymm0 + vpaddd ddq_add_1234(%rip),%ymm0,%ymm0 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $1,%zmm0,%xmm2 + vpshufb %ymm29,%ymm0,%ymm0 + vmovdqu8 0(%rcx,%r11,1),%ymm6{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenc %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 176(%rdi),%zmm15 + vaesenc %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 192(%rdi),%zmm15 + vaesenclast %ymm15,%ymm0,%ymm0 + vpxorq %ymm6,%ymm0,%ymm0 + vextracti32x4 $1,%zmm0,%xmm12 + movq %r9,%r10 + vmovdqu8 %ymm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %ymm29,%ymm6,%ymm6 + vextracti32x4 $1,%zmm6,%xmm13 + subq $16 * (2 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_806 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 320(%rsi),%ymm20 + vpclmulqdq $0x01,%ymm20,%ymm6,%ymm4 + vpclmulqdq $0x10,%ymm20,%ymm6,%ymm5 + vpclmulqdq $0x11,%ymm20,%ymm6,%ymm0 + vpclmulqdq $0x00,%ymm20,%ymm6,%ymm3 + + vpxorq %zmm5,%zmm4,%zmm4 + 
vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_806 +.L_small_initial_partial_block_806: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 336(%rsi),%xmm20 + vpclmulqdq $0x01,%xmm20,%xmm6,%xmm4 + vpclmulqdq $0x10,%xmm20,%xmm6,%xmm5 + vpclmulqdq $0x11,%xmm20,%xmm6,%xmm0 + vpclmulqdq $0x00,%xmm20,%xmm6,%xmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_806: + + orq %r8,%r8 + je .L_after_reduction_806 + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_806: + jmp .L_small_initial_blocks_encrypted_804 +.L_small_initial_num_blocks_is_3_804: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $2,%zmm0,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vmovdqu8 0(%rcx,%r11,1),%zmm6{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 176(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 192(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vpxorq %zmm6,%zmm0,%zmm0 + vextracti32x4 $2,%zmm0,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %zmm29,%zmm6,%zmm6 + vextracti32x4 $2,%zmm6,%xmm13 + subq $16 * (3 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_807 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 304(%rsi),%ymm20 + vinserti64x2 $2,336(%rsi),%zmm20,%zmm20 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + 
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_807 +.L_small_initial_partial_block_807: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 320(%rsi),%ymm20 + vpclmulqdq $0x01,%ymm20,%ymm6,%ymm4 + vpclmulqdq $0x10,%ymm20,%ymm6,%ymm5 + vpclmulqdq $0x11,%ymm20,%ymm6,%ymm0 + vpclmulqdq $0x00,%ymm20,%ymm6,%ymm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_807: + + orq %r8,%r8 + je .L_after_reduction_807 + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_807: + jmp .L_small_initial_blocks_encrypted_804 +.L_small_initial_num_blocks_is_4_804: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $3,%zmm0,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vmovdqu8 0(%rcx,%r11,1),%zmm6{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 176(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 192(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vpxorq %zmm6,%zmm0,%zmm0 + vextracti32x4 $3,%zmm0,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %zmm29,%zmm6,%zmm6 + vextracti32x4 $3,%zmm6,%xmm13 + subq $16 * (4 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_808 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 288(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16 + vpclmulqdq 
$0x01,%zmm20,%zmm6,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19 + + vpxorq %zmm19,%zmm17,%zmm17 + vpsrldq $8,%zmm17,%zmm4 + vpslldq $8,%zmm17,%zmm5 + vpxorq %zmm4,%zmm15,%zmm0 + vpxorq %zmm5,%zmm16,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_808 +.L_small_initial_partial_block_808: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 304(%rsi),%ymm20 + vinserti64x2 $2,336(%rsi),%zmm20,%zmm20 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_808: + + orq %r8,%r8 + je .L_after_reduction_808 + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_808: + jmp .L_small_initial_blocks_encrypted_804 +.L_small_initial_num_blocks_is_5_804: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $64,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $0,%zmm3,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %xmm29,%xmm3,%xmm3 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%xmm7{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 176(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %xmm15,%xmm3,%xmm3 + 
vbroadcastf64x2 192(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %xmm15,%xmm3,%xmm3 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %xmm7,%xmm3,%xmm3 + vextracti32x4 $0,%zmm3,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %xmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm6,%zmm6 + vpshufb %xmm29,%xmm7,%xmm7 + vextracti32x4 $0,%zmm7,%xmm13 + subq $16 * (5 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_809 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 272(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19 + vmovdqu64 336(%rsi),%xmm20 + vpclmulqdq $0x01,%xmm20,%xmm7,%xmm4 + vpclmulqdq $0x10,%xmm20,%xmm7,%xmm5 + vpclmulqdq $0x11,%xmm20,%xmm7,%xmm0 + vpclmulqdq $0x00,%xmm20,%xmm7,%xmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_809 +.L_small_initial_partial_block_809: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 288(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19 + + vpxorq %zmm19,%zmm17,%zmm17 + vpsrldq $8,%zmm17,%zmm4 + vpslldq $8,%zmm17,%zmm5 + vpxorq %zmm4,%zmm15,%zmm0 + vpxorq %zmm5,%zmm16,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_809: + + orq %r8,%r8 + je .L_after_reduction_809 + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_809: + jmp .L_small_initial_blocks_encrypted_804 +.L_small_initial_num_blocks_is_6_804: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $64,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $1,%zmm3,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %ymm29,%ymm3,%ymm3 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%ymm7{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %ymm15,%ymm3,%ymm3 + 
vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 176(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 192(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %ymm15,%ymm3,%ymm3 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %ymm7,%ymm3,%ymm3 + vextracti32x4 $1,%zmm3,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %ymm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm6,%zmm6 + vpshufb %ymm29,%ymm7,%ymm7 + vextracti32x4 $1,%zmm7,%xmm13 + subq $16 * (6 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_810 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 256(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19 + vmovdqu64 320(%rsi),%ymm20 + vpclmulqdq $0x01,%ymm20,%ymm7,%ymm4 + vpclmulqdq $0x10,%ymm20,%ymm7,%ymm5 + vpclmulqdq $0x11,%ymm20,%ymm7,%ymm0 + vpclmulqdq $0x00,%ymm20,%ymm7,%ymm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_810 +.L_small_initial_partial_block_810: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 272(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19 + vmovdqu64 336(%rsi),%xmm20 + vpclmulqdq $0x01,%xmm20,%xmm7,%xmm4 + vpclmulqdq $0x10,%xmm20,%xmm7,%xmm5 + vpclmulqdq $0x11,%xmm20,%xmm7,%xmm0 + vpclmulqdq $0x00,%xmm20,%xmm7,%xmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 
$1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_810: + + orq %r8,%r8 + je .L_after_reduction_810 + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_810: + jmp .L_small_initial_blocks_encrypted_804 +.L_small_initial_num_blocks_is_7_804: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $64,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $2,%zmm3,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%zmm7{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 176(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 192(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %zmm15,%zmm3,%zmm3 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %zmm7,%zmm3,%zmm3 + vextracti32x4 $2,%zmm3,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm6,%zmm6 + vpshufb %zmm29,%zmm7,%zmm7 + vextracti32x4 $2,%zmm7,%xmm13 + subq $16 * (7 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_811 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 240(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19 + vmovdqu64 304(%rsi),%ymm20 + vinserti64x2 $2,336(%rsi),%zmm20,%zmm20 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm5 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 
$1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_811 +.L_small_initial_partial_block_811: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 256(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19 + vmovdqu64 320(%rsi),%ymm20 + vpclmulqdq $0x01,%ymm20,%ymm7,%ymm4 + vpclmulqdq $0x10,%ymm20,%ymm7,%ymm5 + vpclmulqdq $0x11,%ymm20,%ymm7,%ymm0 + vpclmulqdq $0x00,%ymm20,%ymm7,%ymm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_811: + + orq %r8,%r8 + je .L_after_reduction_811 + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_811: + jmp .L_small_initial_blocks_encrypted_804 +.L_small_initial_num_blocks_is_8_804: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $64,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $3,%zmm3,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%zmm7{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 176(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 192(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %zmm15,%zmm3,%zmm3 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %zmm7,%zmm3,%zmm3 
+ vextracti32x4 $3,%zmm3,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm6,%zmm6 + vpshufb %zmm29,%zmm7,%zmm7 + vextracti32x4 $3,%zmm7,%xmm13 + subq $16 * (8 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_812 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 224(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 288(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vpxorq %zmm15,%zmm0,%zmm15 + vpxorq %zmm16,%zmm3,%zmm16 + vpxorq %zmm17,%zmm4,%zmm17 + vpxorq %zmm19,%zmm5,%zmm19 + + vpxorq %zmm19,%zmm17,%zmm17 + vpsrldq $8,%zmm17,%zmm4 + vpslldq $8,%zmm17,%zmm5 + vpxorq %zmm4,%zmm15,%zmm0 + vpxorq %zmm5,%zmm16,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_812 +.L_small_initial_partial_block_812: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 240(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19 + vmovdqu64 304(%rsi),%ymm20 + vinserti64x2 $2,336(%rsi),%zmm20,%zmm20 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm5 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_812: + + orq %r8,%r8 + je .L_after_reduction_812 + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_812: + jmp .L_small_initial_blocks_encrypted_804 +.L_small_initial_num_blocks_is_9_804: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + vpaddd ddq_add_8888(%rip),%zmm0,%zmm4 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $128,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $0,%zmm4,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %xmm29,%xmm4,%xmm4 + vmovdqu8 
0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%zmm7 + vmovdqu8 128(%rcx,%r11,1),%xmm10{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm15,%zmm3,%zmm3 + vpxorq %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 176(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 192(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %zmm15,%zmm3,%zmm3 + vaesenclast %xmm15,%xmm4,%xmm4 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %zmm7,%zmm3,%zmm3 + vpxorq %xmm10,%xmm4,%xmm4 + vextracti32x4 $0,%zmm4,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %xmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm6,%zmm6 + vpshufb %zmm29,%zmm7,%zmm7 + vpshufb %xmm29,%xmm10,%xmm10 + vextracti32x4 $0,%zmm10,%xmm13 + subq $16 * (9 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_813 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 208(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 272(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vpxorq %zmm15,%zmm0,%zmm15 + vpxorq %zmm16,%zmm3,%zmm16 + vpxorq %zmm17,%zmm4,%zmm17 + vpxorq %zmm19,%zmm5,%zmm19 + vmovdqu64 336(%rsi),%xmm20 + vpclmulqdq $0x01,%xmm20,%xmm10,%xmm4 + vpclmulqdq $0x10,%xmm20,%xmm10,%xmm5 + vpclmulqdq $0x11,%xmm20,%xmm10,%xmm0 + vpclmulqdq $0x00,%xmm20,%xmm10,%xmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + 
vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_813 +.L_small_initial_partial_block_813: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 224(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 288(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vpxorq %zmm15,%zmm0,%zmm15 + vpxorq %zmm16,%zmm3,%zmm16 + vpxorq %zmm17,%zmm4,%zmm17 + vpxorq %zmm19,%zmm5,%zmm19 + + vpxorq %zmm19,%zmm17,%zmm17 + vpsrldq $8,%zmm17,%zmm4 + vpslldq $8,%zmm17,%zmm5 + vpxorq %zmm4,%zmm15,%zmm0 + vpxorq %zmm5,%zmm16,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_813: + + orq %r8,%r8 + je .L_after_reduction_813 + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_813: + jmp .L_small_initial_blocks_encrypted_804 +.L_small_initial_num_blocks_is_10_804: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + vpaddd ddq_add_8888(%rip),%zmm0,%zmm4 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $128,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $1,%zmm4,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %ymm29,%ymm4,%ymm4 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%zmm7 + vmovdqu8 128(%rcx,%r11,1),%ymm10{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm15,%zmm3,%zmm3 + vpxorq %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 
176(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 192(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %zmm15,%zmm3,%zmm3 + vaesenclast %ymm15,%ymm4,%ymm4 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %zmm7,%zmm3,%zmm3 + vpxorq %ymm10,%ymm4,%ymm4 + vextracti32x4 $1,%zmm4,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %ymm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm6,%zmm6 + vpshufb %zmm29,%zmm7,%zmm7 + vpshufb %ymm29,%ymm10,%ymm10 + vextracti32x4 $1,%zmm10,%xmm13 + subq $16 * (10 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_814 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 192(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 256(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vpxorq %zmm15,%zmm0,%zmm15 + vpxorq %zmm16,%zmm3,%zmm16 + vpxorq %zmm17,%zmm4,%zmm17 + vpxorq %zmm19,%zmm5,%zmm19 + vmovdqu64 320(%rsi),%ymm20 + vpclmulqdq $0x01,%ymm20,%ymm10,%ymm4 + vpclmulqdq $0x10,%ymm20,%ymm10,%ymm5 + vpclmulqdq $0x11,%ymm20,%ymm10,%ymm0 + vpclmulqdq $0x00,%ymm20,%ymm10,%ymm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_814 +.L_small_initial_partial_block_814: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 208(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 272(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vpxorq %zmm15,%zmm0,%zmm15 + vpxorq %zmm16,%zmm3,%zmm16 + vpxorq %zmm17,%zmm4,%zmm17 + vpxorq %zmm19,%zmm5,%zmm19 + vmovdqu64 336(%rsi),%xmm20 + vpclmulqdq $0x01,%xmm20,%xmm10,%xmm4 + vpclmulqdq $0x10,%xmm20,%xmm10,%xmm5 + vpclmulqdq $0x11,%xmm20,%xmm10,%xmm0 + vpclmulqdq $0x00,%xmm20,%xmm10,%xmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq 
%xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_814: + + orq %r8,%r8 + je .L_after_reduction_814 + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_814: + jmp .L_small_initial_blocks_encrypted_804 +.L_small_initial_num_blocks_is_11_804: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + vpaddd ddq_add_8888(%rip),%zmm0,%zmm4 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $128,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $2,%zmm4,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%zmm7 + vmovdqu8 128(%rcx,%r11,1),%zmm10{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm15,%zmm3,%zmm3 + vpxorq %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 176(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 192(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %zmm15,%zmm3,%zmm3 + vaesenclast %zmm15,%zmm4,%zmm4 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %zmm7,%zmm3,%zmm3 + vpxorq %zmm10,%zmm4,%zmm4 + vextracti32x4 $2,%zmm4,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm6,%zmm6 + vpshufb %zmm29,%zmm7,%zmm7 + vpshufb %zmm29,%zmm10,%zmm10 + vextracti32x4 $2,%zmm10,%xmm13 + subq $16 * (11 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_815 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 176(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 240(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + 
vpxorq %zmm15,%zmm0,%zmm15 + vpxorq %zmm16,%zmm3,%zmm16 + vpxorq %zmm17,%zmm4,%zmm17 + vpxorq %zmm19,%zmm5,%zmm19 + vmovdqu64 304(%rsi),%ymm20 + vinserti64x2 $2,336(%rsi),%zmm20,%zmm20 + vpclmulqdq $0x01,%zmm20,%zmm10,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm10,%zmm5 + vpclmulqdq $0x11,%zmm20,%zmm10,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm10,%zmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_815 +.L_small_initial_partial_block_815: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 192(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 256(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vpxorq %zmm15,%zmm0,%zmm15 + vpxorq %zmm16,%zmm3,%zmm16 + vpxorq %zmm17,%zmm4,%zmm17 + vpxorq %zmm19,%zmm5,%zmm19 + vmovdqu64 320(%rsi),%ymm20 + vpclmulqdq $0x01,%ymm20,%ymm10,%ymm4 + vpclmulqdq $0x10,%ymm20,%ymm10,%ymm5 + vpclmulqdq $0x11,%ymm20,%ymm10,%ymm0 + vpclmulqdq $0x00,%ymm20,%ymm10,%ymm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_815: + + orq %r8,%r8 + je .L_after_reduction_815 + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_815: + jmp .L_small_initial_blocks_encrypted_804 +.L_small_initial_num_blocks_is_12_804: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + vpaddd ddq_add_8888(%rip),%zmm0,%zmm4 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $128,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $3,%zmm4,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%zmm7 + vmovdqu8 128(%rcx,%r11,1),%zmm10{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + 
vpxorq %zmm15,%zmm3,%zmm3 + vpxorq %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 176(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 192(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %zmm15,%zmm3,%zmm3 + vaesenclast %zmm15,%zmm4,%zmm4 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %zmm7,%zmm3,%zmm3 + vpxorq %zmm10,%zmm4,%zmm4 + vextracti32x4 $3,%zmm4,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm6,%zmm6 + vpshufb %zmm29,%zmm7,%zmm7 + vpshufb %zmm29,%zmm10,%zmm10 + vextracti32x4 $3,%zmm10,%xmm13 + subq $16 * (12 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_816 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 160(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 224(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vmovdqu64 288(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm0,%zmm6,%zmm15 + vpternlogq $0x96,%zmm3,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm4,%zmm6,%zmm17 + vpternlogq $0x96,%zmm5,%zmm7,%zmm19 + + vpxorq %zmm19,%zmm17,%zmm17 + vpsrldq $8,%zmm17,%zmm4 + vpslldq $8,%zmm17,%zmm5 + vpxorq %zmm4,%zmm15,%zmm0 + vpxorq %zmm5,%zmm16,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_816 +.L_small_initial_partial_block_816: + + + + + + + + + movq 
%r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 176(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 240(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vpxorq %zmm15,%zmm0,%zmm15 + vpxorq %zmm16,%zmm3,%zmm16 + vpxorq %zmm17,%zmm4,%zmm17 + vpxorq %zmm19,%zmm5,%zmm19 + vmovdqu64 304(%rsi),%ymm20 + vinserti64x2 $2,336(%rsi),%zmm20,%zmm20 + vpclmulqdq $0x01,%zmm20,%zmm10,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm10,%zmm5 + vpclmulqdq $0x11,%zmm20,%zmm10,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm10,%zmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_816: + + orq %r8,%r8 + je .L_after_reduction_816 + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_816: + jmp .L_small_initial_blocks_encrypted_804 +.L_small_initial_num_blocks_is_13_804: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + vpaddd ddq_add_8888(%rip),%zmm0,%zmm4 + vpaddd ddq_add_8888(%rip),%zmm3,%zmm5 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $192,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $0,%zmm5,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %xmm29,%xmm5,%xmm5 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%zmm7 + vmovdqu8 128(%rcx,%r11,1),%zmm10 + vmovdqu8 192(%rcx,%r11,1),%xmm11{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm15,%zmm3,%zmm3 + vpxorq %zmm15,%zmm4,%zmm4 + vpxorq %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc 
%zmm15,%zmm4,%zmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 176(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 192(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %zmm15,%zmm3,%zmm3 + vaesenclast %zmm15,%zmm4,%zmm4 + vaesenclast %xmm15,%xmm5,%xmm5 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %zmm7,%zmm3,%zmm3 + vpxorq %zmm10,%zmm4,%zmm4 + vpxorq %xmm11,%xmm5,%xmm5 + vextracti32x4 $0,%zmm5,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %xmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm6,%zmm6 + vpshufb %zmm29,%zmm7,%zmm7 + vpshufb %zmm29,%zmm10,%zmm10 + vpshufb %xmm29,%xmm11,%xmm11 + vextracti32x4 $0,%zmm11,%xmm13 + subq $16 * (13 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_817 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 144(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 208(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vmovdqu64 272(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm0,%zmm6,%zmm15 + vpternlogq $0x96,%zmm3,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm4,%zmm6,%zmm17 + vpternlogq $0x96,%zmm5,%zmm7,%zmm19 + vmovdqu64 336(%rsi),%xmm20 + vpclmulqdq $0x01,%xmm20,%xmm11,%xmm4 + vpclmulqdq $0x10,%xmm20,%xmm11,%xmm5 + vpclmulqdq $0x11,%xmm20,%xmm11,%xmm0 + vpclmulqdq $0x00,%xmm20,%xmm11,%xmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_817 +.L_small_initial_partial_block_817: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 160(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 224(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + 
vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vmovdqu64 288(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm0,%zmm6,%zmm15 + vpternlogq $0x96,%zmm3,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm4,%zmm6,%zmm17 + vpternlogq $0x96,%zmm5,%zmm7,%zmm19 + + vpxorq %zmm19,%zmm17,%zmm17 + vpsrldq $8,%zmm17,%zmm4 + vpslldq $8,%zmm17,%zmm5 + vpxorq %zmm4,%zmm15,%zmm0 + vpxorq %zmm5,%zmm16,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_817: + + orq %r8,%r8 + je .L_after_reduction_817 + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_817: + jmp .L_small_initial_blocks_encrypted_804 +.L_small_initial_num_blocks_is_14_804: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + vpaddd ddq_add_8888(%rip),%zmm0,%zmm4 + vpaddd ddq_add_8888(%rip),%zmm3,%zmm5 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $192,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $1,%zmm5,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %ymm29,%ymm5,%ymm5 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%zmm7 + vmovdqu8 128(%rcx,%r11,1),%zmm10 + vmovdqu8 192(%rcx,%r11,1),%ymm11{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm15,%zmm3,%zmm3 + vpxorq %zmm15,%zmm4,%zmm4 + vpxorq %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc 
%zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 176(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 192(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %zmm15,%zmm3,%zmm3 + vaesenclast %zmm15,%zmm4,%zmm4 + vaesenclast %ymm15,%ymm5,%ymm5 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %zmm7,%zmm3,%zmm3 + vpxorq %zmm10,%zmm4,%zmm4 + vpxorq %ymm11,%ymm5,%ymm5 + vextracti32x4 $1,%zmm5,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %ymm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm6,%zmm6 + vpshufb %zmm29,%zmm7,%zmm7 + vpshufb %zmm29,%zmm10,%zmm10 + vpshufb %ymm29,%ymm11,%ymm11 + vextracti32x4 $1,%zmm11,%xmm13 + subq $16 * (14 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_818 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 128(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 192(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vmovdqu64 256(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm0,%zmm6,%zmm15 + vpternlogq $0x96,%zmm3,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm4,%zmm6,%zmm17 + vpternlogq $0x96,%zmm5,%zmm7,%zmm19 + vmovdqu64 320(%rsi),%ymm20 + vpclmulqdq $0x01,%ymm20,%ymm11,%ymm4 + vpclmulqdq $0x10,%ymm20,%ymm11,%ymm5 + vpclmulqdq $0x11,%ymm20,%ymm11,%ymm0 + vpclmulqdq $0x00,%ymm20,%ymm11,%ymm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_818 +.L_small_initial_partial_block_818: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 144(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 208(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vmovdqu64 272(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm0,%zmm6,%zmm15 + vpternlogq $0x96,%zmm3,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm4,%zmm6,%zmm17 + vpternlogq 
$0x96,%zmm5,%zmm7,%zmm19 + vmovdqu64 336(%rsi),%xmm20 + vpclmulqdq $0x01,%xmm20,%xmm11,%xmm4 + vpclmulqdq $0x10,%xmm20,%xmm11,%xmm5 + vpclmulqdq $0x11,%xmm20,%xmm11,%xmm0 + vpclmulqdq $0x00,%xmm20,%xmm11,%xmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_818: + + orq %r8,%r8 + je .L_after_reduction_818 + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_818: + jmp .L_small_initial_blocks_encrypted_804 +.L_small_initial_num_blocks_is_15_804: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + vpaddd ddq_add_8888(%rip),%zmm0,%zmm4 + vpaddd ddq_add_8888(%rip),%zmm3,%zmm5 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $192,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $2,%zmm5,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%zmm7 + vmovdqu8 128(%rcx,%r11,1),%zmm10 + vmovdqu8 192(%rcx,%r11,1),%zmm11{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm15,%zmm3,%zmm3 + vpxorq %zmm15,%zmm4,%zmm4 + vpxorq %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 176(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + 
vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 192(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %zmm15,%zmm3,%zmm3 + vaesenclast %zmm15,%zmm4,%zmm4 + vaesenclast %zmm15,%zmm5,%zmm5 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %zmm7,%zmm3,%zmm3 + vpxorq %zmm10,%zmm4,%zmm4 + vpxorq %zmm11,%zmm5,%zmm5 + vextracti32x4 $2,%zmm5,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm6,%zmm6 + vpshufb %zmm29,%zmm7,%zmm7 + vpshufb %zmm29,%zmm10,%zmm10 + vpshufb %zmm29,%zmm11,%zmm11 + vextracti32x4 $2,%zmm11,%xmm13 + subq $16 * (15 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_819 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 112(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 176(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vmovdqu64 240(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm0,%zmm6,%zmm15 + vpternlogq $0x96,%zmm3,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm4,%zmm6,%zmm17 + vpternlogq $0x96,%zmm5,%zmm7,%zmm19 + vmovdqu64 304(%rsi),%ymm20 + vinserti64x2 $2,336(%rsi),%zmm20,%zmm20 + vpclmulqdq $0x01,%zmm20,%zmm11,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm11,%zmm5 + vpclmulqdq $0x11,%zmm20,%zmm11,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm11,%zmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_819 +.L_small_initial_partial_block_819: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 128(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 192(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vmovdqu64 256(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm0,%zmm6,%zmm15 + vpternlogq $0x96,%zmm3,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm4,%zmm6,%zmm17 + vpternlogq $0x96,%zmm5,%zmm7,%zmm19 + vmovdqu64 320(%rsi),%ymm20 + vpclmulqdq $0x01,%ymm20,%ymm11,%ymm4 + 
vpclmulqdq $0x10,%ymm20,%ymm11,%ymm5 + vpclmulqdq $0x11,%ymm20,%ymm11,%ymm0 + vpclmulqdq $0x00,%ymm20,%ymm11,%ymm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_819: + + orq %r8,%r8 + je .L_after_reduction_819 + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_819: + jmp .L_small_initial_blocks_encrypted_804 +.L_small_initial_num_blocks_is_16_804: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + vpaddd ddq_add_8888(%rip),%zmm0,%zmm4 + vpaddd ddq_add_8888(%rip),%zmm3,%zmm5 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $192,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $3,%zmm5,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%zmm7 + vmovdqu8 128(%rcx,%r11,1),%zmm10 + vmovdqu8 192(%rcx,%r11,1),%zmm11{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm15,%zmm3,%zmm3 + vpxorq %zmm15,%zmm4,%zmm4 + vpxorq %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 176(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + 
vbroadcastf64x2 192(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %zmm15,%zmm3,%zmm3 + vaesenclast %zmm15,%zmm4,%zmm4 + vaesenclast %zmm15,%zmm5,%zmm5 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %zmm7,%zmm3,%zmm3 + vpxorq %zmm10,%zmm4,%zmm4 + vpxorq %zmm11,%zmm5,%zmm5 + vextracti32x4 $3,%zmm5,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm6,%zmm6 + vpshufb %zmm29,%zmm7,%zmm7 + vpshufb %zmm29,%zmm10,%zmm10 + vpshufb %zmm29,%zmm11,%zmm11 + vextracti32x4 $3,%zmm11,%xmm13 + subq $16 * (16 - 1),%r8 +.L_small_initial_partial_block_820: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 112(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 176(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vmovdqu64 240(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm0,%zmm6,%zmm15 + vpternlogq $0x96,%zmm3,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm4,%zmm6,%zmm17 + vpternlogq $0x96,%zmm5,%zmm7,%zmm19 + vmovdqu64 304(%rsi),%ymm20 + vinserti64x2 $2,336(%rsi),%zmm20,%zmm20 + vpclmulqdq $0x01,%zmm20,%zmm11,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm11,%zmm5 + vpclmulqdq $0x11,%zmm20,%zmm11,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm11,%zmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_820: + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_820: +.L_small_initial_blocks_encrypted_804: +.L_ghash_done_659: + vmovdqu64 %xmm2,0(%rsi) + vmovdqu64 %xmm14,64(%rsi) +.L_enc_dec_done_659: + jmp .Lexit_gcm_decrypt +.align 32 +.Laes_gcm_decrypt_256_avx512: + orq %r8,%r8 + je .L_enc_dec_done_821 + xorq %r14,%r14 + vmovdqu64 64(%rsi),%xmm14 + + movq (%rdx),%r11 + orq %r11,%r11 + je .L_partial_block_done_822 + movl $16,%r10d + leaq byte_len_to_mask_table(%rip),%r12 + cmpq %r10,%r8 + cmovcq %r8,%r10 + kmovw (%r12,%r10,2),%k1 + vmovdqu8 (%rcx),%xmm0{%k1}{z} + + vmovdqu64 16(%rsi),%xmm3 + vmovdqu64 336(%rsi),%xmm4 + + + + leaq SHIFT_MASK(%rip),%r12 + addq %r11,%r12 + vmovdqu64 (%r12),%xmm5 + vpshufb %xmm5,%xmm3,%xmm3 + + vmovdqa64 %xmm0,%xmm6 + vpxorq %xmm0,%xmm3,%xmm3 + + + leaq (%r8,%r11,1),%r13 + subq $16,%r13 + jge .L_no_extra_mask_822 + subq %r13,%r12 +.L_no_extra_mask_822: + + + + vmovdqu64 16(%r12),%xmm0 + vpand %xmm0,%xmm3,%xmm3 + vpand %xmm0,%xmm6,%xmm6 + vpshufb 
SHUF_MASK(%rip),%xmm6,%xmm6 + vpshufb %xmm5,%xmm6,%xmm6 + vpxorq %xmm6,%xmm14,%xmm14 + cmpq $0,%r13 + jl .L_partial_incomplete_822 + + vpclmulqdq $0x11,%xmm4,%xmm14,%xmm7 + vpclmulqdq $0x00,%xmm4,%xmm14,%xmm10 + vpclmulqdq $0x01,%xmm4,%xmm14,%xmm11 + vpclmulqdq $0x10,%xmm4,%xmm14,%xmm14 + vpxorq %xmm11,%xmm14,%xmm14 + + vpsrldq $8,%xmm14,%xmm11 + vpslldq $8,%xmm14,%xmm14 + vpxorq %xmm11,%xmm7,%xmm7 + vpxorq %xmm10,%xmm14,%xmm14 + + + + vmovdqu64 POLY2(%rip),%xmm11 + + vpclmulqdq $0x01,%xmm14,%xmm11,%xmm10 + vpslldq $8,%xmm10,%xmm10 + vpxorq %xmm10,%xmm14,%xmm14 + + + + vpclmulqdq $0x00,%xmm14,%xmm11,%xmm10 + vpsrldq $4,%xmm10,%xmm10 + vpclmulqdq $0x10,%xmm14,%xmm11,%xmm14 + vpslldq $4,%xmm14,%xmm14 + + vpternlogq $0x96,%xmm10,%xmm7,%xmm14 + + movq $0,(%rdx) + + movq %r11,%r12 + movq $16,%r11 + subq %r12,%r11 + jmp .L_enc_dec_done_822 + +.L_partial_incomplete_822: + addq %r8,(%rdx) + movq %r8,%r11 + +.L_enc_dec_done_822: + + + leaq byte_len_to_mask_table(%rip),%r12 + kmovw (%r12,%r11,2),%k1 + vmovdqu64 %xmm14,64(%rsi) + movq %r9,%r12 + vmovdqu8 %xmm3,(%r12){%k1} +.L_partial_block_done_822: + vmovdqu64 0(%rsi),%xmm2 + subq %r11,%r8 + je .L_enc_dec_done_821 + cmpq $256,%r8 + jbe .L_message_below_equal_16_blocks_821 + + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vmovdqa64 ddq_addbe_4444(%rip),%zmm27 + vmovdqa64 ddq_addbe_1234(%rip),%zmm28 + + + + + + + vmovd %xmm2,%r15d + andl $255,%r15d + + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpshufb %zmm29,%zmm2,%zmm2 + + + + cmpb $240,%r15b + jae .L_next_16_overflow_823 + vpaddd %zmm28,%zmm2,%zmm7 + vpaddd %zmm27,%zmm7,%zmm10 + vpaddd %zmm27,%zmm10,%zmm11 + vpaddd %zmm27,%zmm11,%zmm12 + jmp .L_next_16_ok_823 +.L_next_16_overflow_823: + vpshufb %zmm29,%zmm2,%zmm2 + vmovdqa64 ddq_add_4444(%rip),%zmm12 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm7 + vpaddd %zmm12,%zmm7,%zmm10 + vpaddd %zmm12,%zmm10,%zmm11 + vpaddd %zmm12,%zmm11,%zmm12 + vpshufb %zmm29,%zmm7,%zmm7 + vpshufb %zmm29,%zmm10,%zmm10 + vpshufb %zmm29,%zmm11,%zmm11 + vpshufb %zmm29,%zmm12,%zmm12 +.L_next_16_ok_823: + vshufi64x2 $255,%zmm12,%zmm12,%zmm2 + addb $16,%r15b + + vmovdqu8 0(%rcx,%r11,1),%zmm0 + vmovdqu8 64(%rcx,%r11,1),%zmm3 + vmovdqu8 128(%rcx,%r11,1),%zmm4 + vmovdqu8 192(%rcx,%r11,1),%zmm5 + + + vbroadcastf64x2 0(%rdi),%zmm6 + vpxorq %zmm6,%zmm7,%zmm7 + vpxorq %zmm6,%zmm10,%zmm10 + vpxorq %zmm6,%zmm11,%zmm11 + vpxorq %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 16(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 32(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 48(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 64(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 80(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 96(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 112(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 128(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc 
%zmm6,%zmm12,%zmm12 + vbroadcastf64x2 144(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 160(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 176(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 192(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 208(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 224(%rdi),%zmm6 + vaesenclast %zmm6,%zmm7,%zmm7 + vaesenclast %zmm6,%zmm10,%zmm10 + vaesenclast %zmm6,%zmm11,%zmm11 + vaesenclast %zmm6,%zmm12,%zmm12 + + + vpxorq %zmm0,%zmm7,%zmm7 + vpxorq %zmm3,%zmm10,%zmm10 + vpxorq %zmm4,%zmm11,%zmm11 + vpxorq %zmm5,%zmm12,%zmm12 + + + movq %r9,%r10 + vmovdqu8 %zmm7,0(%r10,%r11,1) + vmovdqu8 %zmm10,64(%r10,%r11,1) + vmovdqu8 %zmm11,128(%r10,%r11,1) + vmovdqu8 %zmm12,192(%r10,%r11,1) + + vpshufb %zmm29,%zmm0,%zmm7 + vpshufb %zmm29,%zmm3,%zmm10 + vpshufb %zmm29,%zmm4,%zmm11 + vpshufb %zmm29,%zmm5,%zmm12 + vmovdqa64 %zmm7,768(%rsp) + vmovdqa64 %zmm10,832(%rsp) + vmovdqa64 %zmm11,896(%rsp) + vmovdqa64 %zmm12,960(%rsp) + testq %r14,%r14 + jnz .L_skip_hkeys_precomputation_824 + + vmovdqu64 288(%rsi),%zmm0 + vmovdqu64 %zmm0,704(%rsp) + + vmovdqu64 224(%rsi),%zmm3 + vmovdqu64 %zmm3,640(%rsp) + + + vshufi64x2 $0x00,%zmm3,%zmm3,%zmm3 + + vmovdqu64 160(%rsi),%zmm4 + vmovdqu64 %zmm4,576(%rsp) + + vmovdqu64 96(%rsi),%zmm5 + vmovdqu64 %zmm5,512(%rsp) +.L_skip_hkeys_precomputation_824: + cmpq $512,%r8 + jb .L_message_below_32_blocks_821 + + + + cmpb $240,%r15b + jae .L_next_16_overflow_825 + vpaddd %zmm28,%zmm2,%zmm7 + vpaddd %zmm27,%zmm7,%zmm10 + vpaddd %zmm27,%zmm10,%zmm11 + vpaddd %zmm27,%zmm11,%zmm12 + jmp .L_next_16_ok_825 +.L_next_16_overflow_825: + vpshufb %zmm29,%zmm2,%zmm2 + vmovdqa64 ddq_add_4444(%rip),%zmm12 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm7 + vpaddd %zmm12,%zmm7,%zmm10 + vpaddd %zmm12,%zmm10,%zmm11 + vpaddd %zmm12,%zmm11,%zmm12 + vpshufb %zmm29,%zmm7,%zmm7 + vpshufb %zmm29,%zmm10,%zmm10 + vpshufb %zmm29,%zmm11,%zmm11 + vpshufb %zmm29,%zmm12,%zmm12 +.L_next_16_ok_825: + vshufi64x2 $255,%zmm12,%zmm12,%zmm2 + addb $16,%r15b + + vmovdqu8 256(%rcx,%r11,1),%zmm0 + vmovdqu8 320(%rcx,%r11,1),%zmm3 + vmovdqu8 384(%rcx,%r11,1),%zmm4 + vmovdqu8 448(%rcx,%r11,1),%zmm5 + + + vbroadcastf64x2 0(%rdi),%zmm6 + vpxorq %zmm6,%zmm7,%zmm7 + vpxorq %zmm6,%zmm10,%zmm10 + vpxorq %zmm6,%zmm11,%zmm11 + vpxorq %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 16(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 32(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 48(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 64(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 80(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 
96(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 112(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 128(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 144(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 160(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 176(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 192(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 208(%rdi),%zmm6 + vaesenc %zmm6,%zmm7,%zmm7 + vaesenc %zmm6,%zmm10,%zmm10 + vaesenc %zmm6,%zmm11,%zmm11 + vaesenc %zmm6,%zmm12,%zmm12 + vbroadcastf64x2 224(%rdi),%zmm6 + vaesenclast %zmm6,%zmm7,%zmm7 + vaesenclast %zmm6,%zmm10,%zmm10 + vaesenclast %zmm6,%zmm11,%zmm11 + vaesenclast %zmm6,%zmm12,%zmm12 + + + vpxorq %zmm0,%zmm7,%zmm7 + vpxorq %zmm3,%zmm10,%zmm10 + vpxorq %zmm4,%zmm11,%zmm11 + vpxorq %zmm5,%zmm12,%zmm12 + + + movq %r9,%r10 + vmovdqu8 %zmm7,256(%r10,%r11,1) + vmovdqu8 %zmm10,320(%r10,%r11,1) + vmovdqu8 %zmm11,384(%r10,%r11,1) + vmovdqu8 %zmm12,448(%r10,%r11,1) + + vpshufb %zmm29,%zmm0,%zmm7 + vpshufb %zmm29,%zmm3,%zmm10 + vpshufb %zmm29,%zmm4,%zmm11 + vpshufb %zmm29,%zmm5,%zmm12 + vmovdqa64 %zmm7,1024(%rsp) + vmovdqa64 %zmm10,1088(%rsp) + vmovdqa64 %zmm11,1152(%rsp) + vmovdqa64 %zmm12,1216(%rsp) + testq %r14,%r14 + jnz .L_skip_hkeys_precomputation_826 + vmovdqu64 640(%rsp),%zmm3 + + + vshufi64x2 $0x00,%zmm3,%zmm3,%zmm3 + + vmovdqu64 576(%rsp),%zmm4 + vmovdqu64 512(%rsp),%zmm5 + + vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4 + vpxorq %zmm10,%zmm4,%zmm4 + + vpsrldq $8,%zmm4,%zmm10 + vpslldq $8,%zmm4,%zmm4 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4 + vpslldq $4,%zmm4,%zmm4 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm4 + + vmovdqu64 %zmm4,448(%rsp) + + vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5 + vpxorq %zmm10,%zmm5,%zmm5 + + vpsrldq $8,%zmm5,%zmm10 + vpslldq $8,%zmm5,%zmm5 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5 + vpslldq $4,%zmm5,%zmm5 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm5 + + vmovdqu64 %zmm5,384(%rsp) + + vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4 + vpxorq %zmm10,%zmm4,%zmm4 + + vpsrldq $8,%zmm4,%zmm10 + vpslldq $8,%zmm4,%zmm4 + vpxorq 
%zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4 + vpslldq $4,%zmm4,%zmm4 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm4 + + vmovdqu64 %zmm4,320(%rsp) + + vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5 + vpxorq %zmm10,%zmm5,%zmm5 + + vpsrldq $8,%zmm5,%zmm10 + vpslldq $8,%zmm5,%zmm5 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5 + vpslldq $4,%zmm5,%zmm5 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm5 + + vmovdqu64 %zmm5,256(%rsp) + + vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4 + vpxorq %zmm10,%zmm4,%zmm4 + + vpsrldq $8,%zmm4,%zmm10 + vpslldq $8,%zmm4,%zmm4 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4 + vpslldq $4,%zmm4,%zmm4 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm4 + + vmovdqu64 %zmm4,192(%rsp) + + vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5 + vpxorq %zmm10,%zmm5,%zmm5 + + vpsrldq $8,%zmm5,%zmm10 + vpslldq $8,%zmm5,%zmm5 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5 + vpslldq $4,%zmm5,%zmm5 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm5 + + vmovdqu64 %zmm5,128(%rsp) + + vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4 + vpxorq %zmm10,%zmm4,%zmm4 + + vpsrldq $8,%zmm4,%zmm10 + vpslldq $8,%zmm4,%zmm4 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4 + vpslldq $4,%zmm4,%zmm4 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm4 + + vmovdqu64 %zmm4,64(%rsp) + + vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5 + vpxorq %zmm10,%zmm5,%zmm5 + + vpsrldq $8,%zmm5,%zmm10 + vpslldq $8,%zmm5,%zmm5 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5 + vpslldq $4,%zmm5,%zmm5 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm5 + + vmovdqu64 %zmm5,0(%rsp) +.L_skip_hkeys_precomputation_826: + movq $1,%r14 + addq $512,%r11 + subq $512,%r8 + + cmpq 
$768,%r8 + jb .L_no_more_big_nblocks_821 +.L_encrypt_big_nblocks_821: + cmpb $240,%r15b + jae .L_16_blocks_overflow_827 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_827 +.L_16_blocks_overflow_827: + vpshufb %zmm29,%zmm2,%zmm2 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_827: + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp),%zmm1 + + + + + vshufi64x2 $255,%zmm5,%zmm5,%zmm2 + addb $16,%r15b + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + + + + + + + + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm6 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + + + + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%zmm21 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm15,%zmm10,%zmm26 + vpxorq %zmm12,%zmm6,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc 
%zmm31,%zmm5,%zmm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 208(%rdi),%zmm31 + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 224(%rdi),%zmm30 + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + + + + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + + + + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1) + vpshufb %zmm29,%zmm17,%zmm0 + vpshufb %zmm29,%zmm19,%zmm3 + vpshufb %zmm29,%zmm20,%zmm4 + vpshufb %zmm29,%zmm21,%zmm5 + vmovdqa64 %zmm0,1280(%rsp) + vmovdqa64 %zmm3,1344(%rsp) + vmovdqa64 %zmm4,1408(%rsp) + vmovdqa64 %zmm5,1472(%rsp) + cmpb $240,%r15b + jae .L_16_blocks_overflow_828 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_828 +.L_16_blocks_overflow_828: + vpshufb %zmm29,%zmm2,%zmm2 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_828: + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 256(%rsp),%zmm1 + + + + + vshufi64x2 $255,%zmm5,%zmm5,%zmm2 + addb $16,%r15b + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 320(%rsp),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + + + + + + + + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 384(%rsp),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 448(%rsp),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm6 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc 
%zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + + + + vmovdqu8 256(%rcx,%r11,1),%zmm17 + vmovdqu8 320(%rcx,%r11,1),%zmm19 + vmovdqu8 384(%rcx,%r11,1),%zmm20 + vmovdqu8 448(%rcx,%r11,1),%zmm21 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vpternlogq $0x96,%zmm12,%zmm6,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 208(%rdi),%zmm31 + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 224(%rdi),%zmm30 + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + + + + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + + + + movq %r9,%r10 + vmovdqu8 %zmm0,256(%r10,%r11,1) + vmovdqu8 %zmm3,320(%r10,%r11,1) + vmovdqu8 %zmm4,384(%r10,%r11,1) + vmovdqu8 %zmm5,448(%r10,%r11,1) + vpshufb %zmm29,%zmm17,%zmm0 + vpshufb %zmm29,%zmm19,%zmm3 + vpshufb %zmm29,%zmm20,%zmm4 + vpshufb %zmm29,%zmm21,%zmm5 + vmovdqa64 %zmm0,768(%rsp) + vmovdqa64 %zmm3,832(%rsp) + vmovdqa64 %zmm4,896(%rsp) + vmovdqa64 %zmm5,960(%rsp) + cmpb $240,%r15b + jae .L_16_blocks_overflow_829 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_829 +.L_16_blocks_overflow_829: + vpshufb %zmm29,%zmm2,%zmm2 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_829: + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + + + + + vshufi64x2 $255,%zmm5,%zmm5,%zmm2 + addb $16,%r15b + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + + + + + + + + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq 
$0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm6 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + + + + vmovdqu8 512(%rcx,%r11,1),%zmm17 + vmovdqu8 576(%rcx,%r11,1),%zmm19 + vmovdqu8 640(%rcx,%r11,1),%zmm20 + vmovdqu8 704(%rcx,%r11,1),%zmm21 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + + + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vpternlogq $0x96,%zmm15,%zmm12,%zmm6 + vpxorq %zmm24,%zmm6,%zmm6 + vpternlogq $0x96,%zmm10,%zmm13,%zmm7 + vpxorq %zmm25,%zmm7,%zmm7 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vextracti64x4 $1,%zmm6,%ymm12 + vpxorq %ymm12,%ymm6,%ymm6 + vextracti32x4 $1,%ymm6,%xmm12 + vpxorq %xmm12,%xmm6,%xmm6 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 208(%rdi),%zmm31 + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 224(%rdi),%zmm30 + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 
+ vaesenc %zmm31,%zmm5,%zmm5 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm6 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + + + + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + + + + movq %r9,%r10 + vmovdqu8 %zmm0,512(%r10,%r11,1) + vmovdqu8 %zmm3,576(%r10,%r11,1) + vmovdqu8 %zmm4,640(%r10,%r11,1) + vmovdqu8 %zmm5,704(%r10,%r11,1) + vpshufb %zmm29,%zmm17,%zmm0 + vpshufb %zmm29,%zmm19,%zmm3 + vpshufb %zmm29,%zmm20,%zmm4 + vpshufb %zmm29,%zmm21,%zmm5 + vmovdqa64 %zmm0,1024(%rsp) + vmovdqa64 %zmm3,1088(%rsp) + vmovdqa64 %zmm4,1152(%rsp) + vmovdqa64 %zmm5,1216(%rsp) + vmovdqa64 %zmm6,%zmm14 + + addq $768,%r11 + subq $768,%r8 + cmpq $768,%r8 + jae .L_encrypt_big_nblocks_821 + +.L_no_more_big_nblocks_821: + + cmpq $512,%r8 + jae .L_encrypt_32_blocks_821 + + cmpq $256,%r8 + jae .L_encrypt_16_blocks_821 +.L_encrypt_0_blocks_ghash_32_821: + movl %r8d,%r10d + andl $~15,%r10d + movl $256,%ebx + subl %r10d,%ebx + vmovdqa64 768(%rsp),%zmm13 + vpxorq %zmm14,%zmm13,%zmm13 + vmovdqu64 0(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 832(%rsp),%zmm13 + vmovdqu64 64(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + vpxorq %zmm10,%zmm4,%zmm26 + vpxorq %zmm6,%zmm0,%zmm24 + vpxorq %zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + vmovdqa64 896(%rsp),%zmm13 + vmovdqu64 128(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 960(%rsp),%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + + vpternlogq $0x96,%zmm10,%zmm4,%zmm26 + vpternlogq $0x96,%zmm6,%zmm0,%zmm24 + vpternlogq $0x96,%zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + addl $256,%ebx + movl %r8d,%r10d + addl $15,%r10d + shrl $4,%r10d + je .L_last_num_blocks_is_0_830 + + cmpl $8,%r10d + je .L_last_num_blocks_is_8_830 + jb .L_last_num_blocks_is_7_1_830 + + + cmpl $12,%r10d + je .L_last_num_blocks_is_12_830 + jb .L_last_num_blocks_is_11_9_830 + + + cmpl $15,%r10d + je .L_last_num_blocks_is_15_830 + ja .L_last_num_blocks_is_16_830 + cmpl $14,%r10d + je .L_last_num_blocks_is_14_830 + jmp .L_last_num_blocks_is_13_830 + +.L_last_num_blocks_is_11_9_830: + + cmpl $10,%r10d + je .L_last_num_blocks_is_10_830 + ja .L_last_num_blocks_is_11_830 + jmp .L_last_num_blocks_is_9_830 + +.L_last_num_blocks_is_7_1_830: + cmpl $4,%r10d + je .L_last_num_blocks_is_4_830 + jb .L_last_num_blocks_is_3_1_830 + + cmpl $6,%r10d + ja .L_last_num_blocks_is_7_830 + je .L_last_num_blocks_is_6_830 + jmp .L_last_num_blocks_is_5_830 + +.L_last_num_blocks_is_3_1_830: + + cmpl $2,%r10d + ja .L_last_num_blocks_is_3_830 + je .L_last_num_blocks_is_2_830 +.L_last_num_blocks_is_1_830: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $255,%r15d + jae .L_16_blocks_overflow_831 + vpaddd %xmm28,%xmm2,%xmm0 + jmp 
.L_16_blocks_ok_831 + +.L_16_blocks_overflow_831: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %xmm29,%xmm0,%xmm0 +.L_16_blocks_ok_831: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%xmm17{%k1}{z} + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %xmm31,%xmm0,%xmm0 + vaesenclast %xmm30,%xmm0,%xmm0 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti32x4 $0,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %xmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm17,%zmm17{%k1}{z} + vpshufb %xmm29,%xmm17,%xmm17 + vextracti32x4 $0,%zmm17,%xmm7 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_832 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq 
$0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_832 +.L_small_initial_partial_block_832: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + + + vpsrldq $8,%zmm26,%zmm0 + vpslldq $8,%zmm26,%zmm3 + vpxorq %zmm0,%zmm24,%zmm24 + vpxorq %zmm3,%zmm25,%zmm25 + vextracti64x4 $1,%zmm24,%ymm0 + vpxorq %ymm0,%ymm24,%ymm24 + vextracti32x4 $1,%ymm24,%xmm0 + vpxorq %xmm0,%xmm24,%xmm24 + vextracti64x4 $1,%zmm25,%ymm3 + vpxorq %ymm3,%ymm25,%ymm25 + vextracti32x4 $1,%ymm25,%xmm3 + vpxorq %xmm3,%xmm25,%xmm25 + vmovdqa64 POLY2(%rip),%xmm0 + + + vpclmulqdq $0x01,%xmm25,%xmm0,%xmm3 + vpslldq $8,%xmm3,%xmm3 + vpxorq %xmm3,%xmm25,%xmm3 + + + vpclmulqdq $0x00,%xmm3,%xmm0,%xmm4 + vpsrldq $4,%xmm4,%xmm4 + vpclmulqdq $0x10,%xmm3,%xmm0,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm24,%xmm4,%xmm14 + + + + + + + + + + + + + vpxorq %xmm7,%xmm14,%xmm14 + + jmp .L_after_reduction_832 +.L_small_initial_compute_done_832: +.L_after_reduction_832: + jmp .L_last_blocks_done_830 +.L_last_num_blocks_is_2_830: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $254,%r15d + jae .L_16_blocks_overflow_833 + vpaddd %ymm28,%ymm2,%ymm0 + jmp .L_16_blocks_ok_833 + +.L_16_blocks_overflow_833: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %ymm29,%ymm0,%ymm0 +.L_16_blocks_ok_833: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%ymm17{%k1}{z} + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc 
%ymm30,%ymm0,%ymm0 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %ymm31,%ymm0,%ymm0 + vaesenclast %ymm30,%ymm0,%ymm0 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %ymm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm17,%zmm17{%k1}{z} + vpshufb %ymm29,%ymm17,%ymm17 + vextracti32x4 $1,%zmm17,%xmm7 + subq $16 * (2 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_834 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_834 +.L_small_initial_partial_block_834: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_834: + + orq %r8,%r8 + je .L_after_reduction_834 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_834: + jmp .L_last_blocks_done_830 +.L_last_num_blocks_is_3_830: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $253,%r15d + jae .L_16_blocks_overflow_835 + vpaddd %zmm28,%zmm2,%zmm0 + jmp .L_16_blocks_ok_835 + +.L_16_blocks_overflow_835: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %zmm29,%zmm0,%zmm0 +.L_16_blocks_ok_835: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq 
$0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm0,%zmm0 + vpxorq %zmm17,%zmm0,%zmm0 + vextracti32x4 $2,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm17,%zmm17{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vextracti32x4 $2,%zmm17,%xmm7 + subq $16 * (3 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_836 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_836 +.L_small_initial_partial_block_836: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + 
vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_836: + + orq %r8,%r8 + je .L_after_reduction_836 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_836: + jmp .L_last_blocks_done_830 +.L_last_num_blocks_is_4_830: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $252,%r15d + jae .L_16_blocks_overflow_837 + vpaddd %zmm28,%zmm2,%zmm0 + jmp .L_16_blocks_ok_837 + +.L_16_blocks_overflow_837: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %zmm29,%zmm0,%zmm0 +.L_16_blocks_ok_837: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm0,%zmm0 + vpxorq %zmm17,%zmm0,%zmm0 + 
vextracti32x4 $3,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm17,%zmm17{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vextracti32x4 $3,%zmm17,%xmm7 + subq $16 * (4 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_838 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_838 +.L_small_initial_partial_block_838: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_838: + + orq %r8,%r8 + je .L_after_reduction_838 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_838: + jmp .L_last_blocks_done_830 +.L_last_num_blocks_is_5_830: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $251,%r15d + jae .L_16_blocks_overflow_839 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %xmm27,%xmm0,%xmm3 + jmp .L_16_blocks_ok_839 + +.L_16_blocks_overflow_839: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %xmm29,%xmm3,%xmm3 +.L_16_blocks_ok_839: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + 
vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%xmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %xmm30,%xmm3,%xmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %xmm19,%xmm3,%xmm3 + vextracti32x4 $0,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %xmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm19,%zmm19{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %xmm29,%xmm19,%xmm19 + vextracti32x4 $0,%zmm19,%xmm7 + subq $16 * (5 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_840 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq 
%xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_840 +.L_small_initial_partial_block_840: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_840: + + orq %r8,%r8 + je .L_after_reduction_840 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_840: + jmp .L_last_blocks_done_830 +.L_last_num_blocks_is_6_830: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $250,%r15d + jae .L_16_blocks_overflow_841 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %ymm27,%ymm0,%ymm3 + jmp .L_16_blocks_ok_841 + +.L_16_blocks_overflow_841: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %ymm29,%ymm3,%ymm3 +.L_16_blocks_ok_841: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 
96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%ymm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %ymm30,%ymm3,%ymm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %ymm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm19,%zmm19{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %ymm29,%ymm19,%ymm19 + vextracti32x4 $1,%zmm19,%xmm7 + subq $16 * (6 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_842 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_842 +.L_small_initial_partial_block_842: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq 
%zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_842: + + orq %r8,%r8 + je .L_after_reduction_842 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_842: + jmp .L_last_blocks_done_830 +.L_last_num_blocks_is_7_830: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $249,%r15d + jae .L_16_blocks_overflow_843 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + jmp .L_16_blocks_ok_843 + +.L_16_blocks_overflow_843: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 +.L_16_blocks_ok_843: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc 
%zmm30,%zmm3,%zmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti32x4 $2,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm19,%zmm19{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vextracti32x4 $2,%zmm19,%xmm7 + subq $16 * (7 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_844 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_844 +.L_small_initial_partial_block_844: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_844: + 
+ orq %r8,%r8 + je .L_after_reduction_844 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_844: + jmp .L_last_blocks_done_830 +.L_last_num_blocks_is_8_830: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $248,%r15d + jae .L_16_blocks_overflow_845 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + jmp .L_16_blocks_ok_845 + +.L_16_blocks_overflow_845: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 +.L_16_blocks_ok_845: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti32x4 $3,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 
%zmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm19,%zmm19{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vextracti32x4 $3,%zmm19,%xmm7 + subq $16 * (8 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_846 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_846 +.L_small_initial_partial_block_846: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_846: + + orq %r8,%r8 + je .L_after_reduction_846 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_846: + jmp .L_last_blocks_done_830 +.L_last_num_blocks_is_9_830: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $247,%r15d + jae .L_16_blocks_overflow_847 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %xmm27,%xmm3,%xmm4 + jmp .L_16_blocks_ok_847 + +.L_16_blocks_overflow_847: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 
+ vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %xmm29,%xmm4,%xmm4 +.L_16_blocks_ok_847: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%xmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %xmm30,%xmm4,%xmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %xmm20,%xmm4,%xmm4 + vextracti32x4 $0,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %xmm4,128(%r10,%r11,1){%k1} + 
vmovdqu8 %zmm20,%zmm20{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %xmm29,%xmm20,%xmm20 + vextracti32x4 $0,%zmm20,%xmm7 + subq $16 * (9 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_848 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_848 +.L_small_initial_partial_block_848: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_848: + + orq %r8,%r8 + je .L_after_reduction_848 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_848: + jmp .L_last_blocks_done_830 +.L_last_num_blocks_is_10_830: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $246,%r15d + jae .L_16_blocks_overflow_849 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd 
%ymm27,%ymm3,%ymm4 + jmp .L_16_blocks_ok_849 + +.L_16_blocks_overflow_849: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %ymm29,%ymm4,%ymm4 +.L_16_blocks_ok_849: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%ymm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + 
vaesenclast %ymm30,%ymm4,%ymm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %ymm20,%ymm4,%ymm4 + vextracti32x4 $1,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %ymm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm20,%zmm20{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %ymm29,%ymm20,%ymm20 + vextracti32x4 $1,%zmm20,%xmm7 + subq $16 * (10 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_850 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_850 +.L_small_initial_partial_block_850: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq 
$0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_850: + + orq %r8,%r8 + je .L_after_reduction_850 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_850: + jmp .L_last_blocks_done_830 +.L_last_num_blocks_is_11_830: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $245,%r15d + jae .L_16_blocks_overflow_851 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + jmp .L_16_blocks_ok_851 + +.L_16_blocks_overflow_851: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 +.L_16_blocks_ok_851: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc 
%zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vextracti32x4 $2,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm20,%zmm20{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vextracti32x4 $2,%zmm20,%xmm7 + subq $16 * (11 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_852 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_852 +.L_small_initial_partial_block_852: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + 
vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_852: + + orq %r8,%r8 + je .L_after_reduction_852 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_852: + jmp .L_last_blocks_done_830 +.L_last_num_blocks_is_12_830: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $244,%r15d + jae .L_16_blocks_overflow_853 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + jmp .L_16_blocks_ok_853 + +.L_16_blocks_overflow_853: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 +.L_16_blocks_ok_853: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 128(%rdi),%zmm30 
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vextracti32x4 $3,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm20,%zmm20{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vextracti32x4 $3,%zmm20,%xmm7 + subq $16 * (12 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_854 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 160(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_854 +.L_small_initial_partial_block_854: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq 
$0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_854: + + orq %r8,%r8 + je .L_after_reduction_854 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_854: + jmp .L_last_blocks_done_830 +.L_last_num_blocks_is_13_830: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $243,%r15d + jae .L_16_blocks_overflow_855 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %xmm27,%xmm4,%xmm5 + jmp .L_16_blocks_ok_855 + +.L_16_blocks_overflow_855: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %xmm29,%xmm5,%xmm5 +.L_16_blocks_ok_855: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq 
$0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%xmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %xmm30,%xmm5,%xmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %xmm21,%xmm5,%xmm5 + vextracti32x4 $0,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %xmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm21,%zmm21{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vpshufb %xmm29,%xmm21,%xmm21 + vextracti32x4 $0,%zmm21,%xmm7 + subq $16 * (13 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_856 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 144(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq 
$0x01,%xmm1,%xmm21,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_856 +.L_small_initial_partial_block_856: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 160(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_856: + + orq %r8,%r8 + je .L_after_reduction_856 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_856: + jmp .L_last_blocks_done_830 +.L_last_num_blocks_is_14_830: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $242,%r15d + jae .L_16_blocks_overflow_857 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %ymm27,%ymm4,%ymm5 + jmp .L_16_blocks_ok_857 + +.L_16_blocks_overflow_857: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %ymm29,%ymm5,%ymm5 +.L_16_blocks_ok_857: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm5,%xmm2 + vshufi64x2 
$0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%ymm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast 
%ymm30,%ymm5,%ymm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %ymm21,%ymm5,%ymm5 + vextracti32x4 $1,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %ymm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm21,%zmm21{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vpshufb %ymm29,%ymm21,%ymm21 + vextracti32x4 $1,%zmm21,%xmm7 + subq $16 * (14 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_858 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 128(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_858 +.L_small_initial_partial_block_858: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 144(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 
+ vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_858: + + orq %r8,%r8 + je .L_after_reduction_858 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_858: + jmp .L_last_blocks_done_830 +.L_last_num_blocks_is_15_830: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $241,%r15d + jae .L_16_blocks_overflow_859 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_859 + +.L_16_blocks_overflow_859: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_859: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq 
$0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + vextracti32x4 $2,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm21,%zmm21{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vpshufb %zmm29,%zmm21,%zmm21 + vextracti32x4 $2,%zmm21,%xmm7 + subq $16 * (15 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_860 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 112(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + 
vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_860 +.L_small_initial_partial_block_860: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 128(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_860: + + orq %r8,%r8 + je .L_after_reduction_860 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_860: + jmp .L_last_blocks_done_830 +.L_last_num_blocks_is_16_830: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $240,%r15d + jae .L_16_blocks_overflow_861 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_861 + +.L_16_blocks_overflow_861: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_861: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + 
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm14,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + vextracti32x4 $3,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1){%k1} + 
vmovdqu8 %zmm21,%zmm21{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vpshufb %zmm29,%zmm21,%zmm21 + vextracti32x4 $3,%zmm21,%xmm7 + subq $16 * (16 - 1),%r8 +.L_small_initial_partial_block_862: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 112(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_862: + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_862: + jmp .L_last_blocks_done_830 +.L_last_num_blocks_is_0_830: + vmovdqa64 1024(%rsp),%zmm13 + vmovdqu64 0(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 1088(%rsp),%zmm13 + vmovdqu64 64(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + vpternlogq $0x96,%zmm10,%zmm4,%zmm26 + vpternlogq $0x96,%zmm6,%zmm0,%zmm24 + vpternlogq $0x96,%zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + vmovdqa64 1152(%rsp),%zmm13 + vmovdqu64 128(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 1216(%rsp),%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + + vpternlogq $0x96,%zmm10,%zmm4,%zmm26 + vpternlogq $0x96,%zmm6,%zmm0,%zmm24 + vpternlogq $0x96,%zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + + vpsrldq $8,%zmm26,%zmm0 + vpslldq $8,%zmm26,%zmm3 + vpxorq %zmm0,%zmm24,%zmm24 + vpxorq %zmm3,%zmm25,%zmm25 + vextracti64x4 $1,%zmm24,%ymm0 + vpxorq 
%ymm0,%ymm24,%ymm24 + vextracti32x4 $1,%ymm24,%xmm0 + vpxorq %xmm0,%xmm24,%xmm24 + vextracti64x4 $1,%zmm25,%ymm3 + vpxorq %ymm3,%ymm25,%ymm25 + vextracti32x4 $1,%ymm25,%xmm3 + vpxorq %xmm3,%xmm25,%xmm25 + vmovdqa64 POLY2(%rip),%xmm4 + + + vpclmulqdq $0x01,%xmm25,%xmm4,%xmm0 + vpslldq $8,%xmm0,%xmm0 + vpxorq %xmm0,%xmm25,%xmm0 + + + vpclmulqdq $0x00,%xmm0,%xmm4,%xmm3 + vpsrldq $4,%xmm3,%xmm3 + vpclmulqdq $0x10,%xmm0,%xmm4,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm24,%xmm3,%xmm14 + +.L_last_blocks_done_830: + vpshufb %xmm29,%xmm2,%xmm2 + jmp .L_ghash_done_821 +.L_encrypt_32_blocks_821: + cmpb $240,%r15b + jae .L_16_blocks_overflow_863 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_863 +.L_16_blocks_overflow_863: + vpshufb %zmm29,%zmm2,%zmm2 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_863: + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp),%zmm1 + + + + + vshufi64x2 $255,%zmm5,%zmm5,%zmm2 + addb $16,%r15b + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + + + + + + + + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm6 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + + + + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%zmm21 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm15,%zmm10,%zmm26 + vpxorq %zmm12,%zmm6,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 208(%rdi),%zmm31 + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 224(%rdi),%zmm30 + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + + + + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + + + + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1) + vpshufb %zmm29,%zmm17,%zmm0 + vpshufb %zmm29,%zmm19,%zmm3 + vpshufb %zmm29,%zmm20,%zmm4 + vpshufb %zmm29,%zmm21,%zmm5 + vmovdqa64 %zmm0,1280(%rsp) + vmovdqa64 %zmm3,1344(%rsp) + vmovdqa64 %zmm4,1408(%rsp) + vmovdqa64 %zmm5,1472(%rsp) + cmpb $240,%r15b + jae .L_16_blocks_overflow_864 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_864 +.L_16_blocks_overflow_864: + vpshufb %zmm29,%zmm2,%zmm2 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_864: + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1024(%rsp),%zmm8 + vmovdqu64 256(%rsp),%zmm1 + + + + + vshufi64x2 $255,%zmm5,%zmm5,%zmm2 + addb $16,%r15b + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 320(%rsp),%zmm18 + vmovdqa64 1088(%rsp),%zmm22 + + + + + + + + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 384(%rsp),%zmm1 + vmovdqa64 1152(%rsp),%zmm8 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 448(%rsp),%zmm18 + vmovdqa64 1216(%rsp),%zmm22 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq 
$0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm6 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + + + + vmovdqu8 256(%rcx,%r11,1),%zmm17 + vmovdqu8 320(%rcx,%r11,1),%zmm19 + vmovdqu8 384(%rcx,%r11,1),%zmm20 + vmovdqu8 448(%rcx,%r11,1),%zmm21 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm15,%zmm10,%zmm26 + vpternlogq $0x96,%zmm12,%zmm6,%zmm24 + vpternlogq $0x96,%zmm13,%zmm7,%zmm25 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 208(%rdi),%zmm31 + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 224(%rdi),%zmm30 + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + + + + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + + + + movq %r9,%r10 + vmovdqu8 %zmm0,256(%r10,%r11,1) + vmovdqu8 %zmm3,320(%r10,%r11,1) + vmovdqu8 %zmm4,384(%r10,%r11,1) + vmovdqu8 %zmm5,448(%r10,%r11,1) + vpshufb %zmm29,%zmm17,%zmm0 + vpshufb %zmm29,%zmm19,%zmm3 + vpshufb %zmm29,%zmm20,%zmm4 + vpshufb %zmm29,%zmm21,%zmm5 + vmovdqa64 %zmm0,768(%rsp) + vmovdqa64 %zmm3,832(%rsp) + vmovdqa64 %zmm4,896(%rsp) + vmovdqa64 %zmm5,960(%rsp) + vmovdqa64 1280(%rsp),%zmm13 + vmovdqu64 512(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 1344(%rsp),%zmm13 + vmovdqu64 576(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + vpternlogq $0x96,%zmm10,%zmm4,%zmm26 + vpternlogq $0x96,%zmm6,%zmm0,%zmm24 + vpternlogq 
$0x96,%zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + vmovdqa64 1408(%rsp),%zmm13 + vmovdqu64 640(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 1472(%rsp),%zmm13 + vmovdqu64 704(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + + vpternlogq $0x96,%zmm10,%zmm4,%zmm26 + vpternlogq $0x96,%zmm6,%zmm0,%zmm24 + vpternlogq $0x96,%zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + + vpsrldq $8,%zmm26,%zmm0 + vpslldq $8,%zmm26,%zmm3 + vpxorq %zmm0,%zmm24,%zmm24 + vpxorq %zmm3,%zmm25,%zmm25 + vextracti64x4 $1,%zmm24,%ymm0 + vpxorq %ymm0,%ymm24,%ymm24 + vextracti32x4 $1,%ymm24,%xmm0 + vpxorq %xmm0,%xmm24,%xmm24 + vextracti64x4 $1,%zmm25,%ymm3 + vpxorq %ymm3,%ymm25,%ymm25 + vextracti32x4 $1,%ymm25,%xmm3 + vpxorq %xmm3,%xmm25,%xmm25 + vmovdqa64 POLY2(%rip),%xmm4 + + + vpclmulqdq $0x01,%xmm25,%xmm4,%xmm0 + vpslldq $8,%xmm0,%xmm0 + vpxorq %xmm0,%xmm25,%xmm0 + + + vpclmulqdq $0x00,%xmm0,%xmm4,%xmm3 + vpsrldq $4,%xmm3,%xmm3 + vpclmulqdq $0x10,%xmm0,%xmm4,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm24,%xmm3,%xmm14 + + subq $512,%r8 + addq $512,%r11 + movl %r8d,%r10d + andl $~15,%r10d + movl $512,%ebx + subl %r10d,%ebx + movl %r8d,%r10d + addl $15,%r10d + shrl $4,%r10d + je .L_last_num_blocks_is_0_865 + + cmpl $8,%r10d + je .L_last_num_blocks_is_8_865 + jb .L_last_num_blocks_is_7_1_865 + + + cmpl $12,%r10d + je .L_last_num_blocks_is_12_865 + jb .L_last_num_blocks_is_11_9_865 + + + cmpl $15,%r10d + je .L_last_num_blocks_is_15_865 + ja .L_last_num_blocks_is_16_865 + cmpl $14,%r10d + je .L_last_num_blocks_is_14_865 + jmp .L_last_num_blocks_is_13_865 + +.L_last_num_blocks_is_11_9_865: + + cmpl $10,%r10d + je .L_last_num_blocks_is_10_865 + ja .L_last_num_blocks_is_11_865 + jmp .L_last_num_blocks_is_9_865 + +.L_last_num_blocks_is_7_1_865: + cmpl $4,%r10d + je .L_last_num_blocks_is_4_865 + jb .L_last_num_blocks_is_3_1_865 + + cmpl $6,%r10d + ja .L_last_num_blocks_is_7_865 + je .L_last_num_blocks_is_6_865 + jmp .L_last_num_blocks_is_5_865 + +.L_last_num_blocks_is_3_1_865: + + cmpl $2,%r10d + ja .L_last_num_blocks_is_3_865 + je .L_last_num_blocks_is_2_865 +.L_last_num_blocks_is_1_865: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $255,%r15d + jae .L_16_blocks_overflow_866 + vpaddd %xmm28,%xmm2,%xmm0 + jmp .L_16_blocks_ok_866 + +.L_16_blocks_overflow_866: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %xmm29,%xmm0,%xmm0 +.L_16_blocks_ok_866: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 
960(%rsp),%zmm22 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%xmm17{%k1}{z} + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %xmm31,%xmm0,%xmm0 + vaesenclast %xmm30,%xmm0,%xmm0 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti32x4 $0,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %xmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm17,%zmm17{%k1}{z} + vpshufb %xmm29,%xmm17,%xmm17 + vextracti32x4 $0,%zmm17,%xmm7 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_867 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_867 +.L_small_initial_partial_block_867: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + + + vpsrldq $8,%zmm26,%zmm0 + vpslldq $8,%zmm26,%zmm3 + vpxorq %zmm0,%zmm24,%zmm24 + vpxorq %zmm3,%zmm25,%zmm25 + vextracti64x4 $1,%zmm24,%ymm0 + vpxorq %ymm0,%ymm24,%ymm24 + vextracti32x4 $1,%ymm24,%xmm0 + vpxorq %xmm0,%xmm24,%xmm24 + vextracti64x4 $1,%zmm25,%ymm3 + vpxorq %ymm3,%ymm25,%ymm25 + vextracti32x4 $1,%ymm25,%xmm3 + vpxorq %xmm3,%xmm25,%xmm25 + vmovdqa64 POLY2(%rip),%xmm0 + + + vpclmulqdq $0x01,%xmm25,%xmm0,%xmm3 + vpslldq $8,%xmm3,%xmm3 + vpxorq %xmm3,%xmm25,%xmm3 + + + vpclmulqdq $0x00,%xmm3,%xmm0,%xmm4 + vpsrldq $4,%xmm4,%xmm4 + vpclmulqdq $0x10,%xmm3,%xmm0,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm24,%xmm4,%xmm14 + + + + + + + + + + + + + vpxorq %xmm7,%xmm14,%xmm14 + + jmp .L_after_reduction_867 
+.L_small_initial_compute_done_867: +.L_after_reduction_867: + jmp .L_last_blocks_done_865 +.L_last_num_blocks_is_2_865: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $254,%r15d + jae .L_16_blocks_overflow_868 + vpaddd %ymm28,%ymm2,%ymm0 + jmp .L_16_blocks_ok_868 + +.L_16_blocks_overflow_868: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %ymm29,%ymm0,%ymm0 +.L_16_blocks_ok_868: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%ymm17{%k1}{z} + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %ymm31,%ymm0,%ymm0 + vaesenclast %ymm30,%ymm0,%ymm0 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %ymm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm17,%zmm17{%k1}{z} + vpshufb %ymm29,%ymm17,%ymm17 + vextracti32x4 $1,%zmm17,%xmm7 + subq $16 * (2 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_869 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + 
vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_869 +.L_small_initial_partial_block_869: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_869: + + orq %r8,%r8 + je .L_after_reduction_869 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_869: + jmp .L_last_blocks_done_865 +.L_last_num_blocks_is_3_865: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $253,%r15d + jae .L_16_blocks_overflow_870 + vpaddd %zmm28,%zmm2,%zmm0 + jmp .L_16_blocks_ok_870 + +.L_16_blocks_overflow_870: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %zmm29,%zmm0,%zmm0 +.L_16_blocks_ok_870: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + 
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm0,%zmm0 + vpxorq %zmm17,%zmm0,%zmm0 + vextracti32x4 $2,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm17,%zmm17{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vextracti32x4 $2,%zmm17,%xmm7 + subq $16 * (3 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_871 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_871 +.L_small_initial_partial_block_871: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_871: + + orq %r8,%r8 + je .L_after_reduction_871 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_871: + jmp .L_last_blocks_done_865 +.L_last_num_blocks_is_4_865: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $252,%r15d + jae .L_16_blocks_overflow_872 + vpaddd %zmm28,%zmm2,%zmm0 + jmp .L_16_blocks_ok_872 + +.L_16_blocks_overflow_872: 
+ vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %zmm29,%zmm0,%zmm0 +.L_16_blocks_ok_872: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm0,%zmm0 + vpxorq %zmm17,%zmm0,%zmm0 + vextracti32x4 $3,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm17,%zmm17{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vextracti32x4 $3,%zmm17,%xmm7 + subq $16 * (4 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_873 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq 
$4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_873 +.L_small_initial_partial_block_873: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_873: + + orq %r8,%r8 + je .L_after_reduction_873 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_873: + jmp .L_last_blocks_done_865 +.L_last_num_blocks_is_5_865: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $251,%r15d + jae .L_16_blocks_overflow_874 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %xmm27,%xmm0,%xmm3 + jmp .L_16_blocks_ok_874 + +.L_16_blocks_overflow_874: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %xmm29,%xmm3,%xmm3 +.L_16_blocks_ok_874: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%xmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 
112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %xmm30,%xmm3,%xmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %xmm19,%xmm3,%xmm3 + vextracti32x4 $0,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %xmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm19,%zmm19{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %xmm29,%xmm19,%xmm19 + vextracti32x4 $0,%zmm19,%xmm7 + subq $16 * (5 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_875 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_875 +.L_small_initial_partial_block_875: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 
+ vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_875: + + orq %r8,%r8 + je .L_after_reduction_875 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_875: + jmp .L_last_blocks_done_865 +.L_last_num_blocks_is_6_865: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $250,%r15d + jae .L_16_blocks_overflow_876 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %ymm27,%ymm0,%ymm3 + jmp .L_16_blocks_ok_876 + +.L_16_blocks_overflow_876: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %ymm29,%ymm3,%ymm3 +.L_16_blocks_ok_876: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%ymm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc 
%ymm31,%ymm3,%ymm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %ymm30,%ymm3,%ymm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %ymm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm19,%zmm19{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %ymm29,%ymm19,%ymm19 + vextracti32x4 $1,%zmm19,%xmm7 + subq $16 * (6 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_877 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_877 +.L_small_initial_partial_block_877: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_877: + + orq %r8,%r8 + je .L_after_reduction_877 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_877: + jmp .L_last_blocks_done_865 +.L_last_num_blocks_is_7_865: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $249,%r15d + jae .L_16_blocks_overflow_878 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + jmp .L_16_blocks_ok_878 + +.L_16_blocks_overflow_878: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 
ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 +.L_16_blocks_ok_878: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti32x4 $2,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm19,%zmm19{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vextracti32x4 $2,%zmm19,%xmm7 + subq $16 * (7 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_879 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + 
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_879 +.L_small_initial_partial_block_879: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_879: + + orq %r8,%r8 + je .L_after_reduction_879 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_879: + jmp .L_last_blocks_done_865 +.L_last_num_blocks_is_8_865: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $248,%r15d + jae .L_16_blocks_overflow_880 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + jmp .L_16_blocks_ok_880 + +.L_16_blocks_overflow_880: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 +.L_16_blocks_ok_880: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc 
%zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti32x4 $3,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm19,%zmm19{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vextracti32x4 $3,%zmm19,%xmm7 + subq $16 * (8 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_881 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + 
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_881 +.L_small_initial_partial_block_881: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_881: + + orq %r8,%r8 + je .L_after_reduction_881 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_881: + jmp .L_last_blocks_done_865 +.L_last_num_blocks_is_9_865: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $247,%r15d + jae .L_16_blocks_overflow_882 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %xmm27,%xmm3,%xmm4 + jmp .L_16_blocks_ok_882 + +.L_16_blocks_overflow_882: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %xmm29,%xmm4,%xmm4 +.L_16_blocks_ok_882: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq 
$0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%xmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %xmm30,%xmm4,%xmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %xmm20,%xmm4,%xmm4 + vextracti32x4 $0,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %xmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm20,%zmm20{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %xmm29,%xmm20,%xmm20 + vextracti32x4 $0,%zmm20,%xmm7 + subq $16 * (9 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_883 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + 
vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_883 +.L_small_initial_partial_block_883: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_883: + + orq %r8,%r8 + je .L_after_reduction_883 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_883: + jmp .L_last_blocks_done_865 +.L_last_num_blocks_is_10_865: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $246,%r15d + jae .L_16_blocks_overflow_884 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %ymm27,%ymm3,%ymm4 + jmp .L_16_blocks_ok_884 + +.L_16_blocks_overflow_884: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %ymm29,%ymm4,%ymm4 +.L_16_blocks_ok_884: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 
960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%ymm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %ymm30,%ymm4,%ymm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %ymm20,%ymm4,%ymm4 + vextracti32x4 $1,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %ymm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm20,%zmm20{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %ymm29,%ymm20,%ymm20 + vextracti32x4 $1,%zmm20,%xmm7 + subq $16 * (10 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_885 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + 
vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_885 +.L_small_initial_partial_block_885: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_885: + + orq %r8,%r8 + je .L_after_reduction_885 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_885: + jmp .L_last_blocks_done_865 +.L_last_num_blocks_is_11_865: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $245,%r15d + jae .L_16_blocks_overflow_886 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + jmp .L_16_blocks_ok_886 + +.L_16_blocks_overflow_886: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 +.L_16_blocks_ok_886: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + 
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vextracti32x4 $2,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm20,%zmm20{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vextracti32x4 $2,%zmm20,%xmm7 + subq $16 * (11 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_887 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq 
$0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_887 +.L_small_initial_partial_block_887: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_887: + + orq %r8,%r8 + je .L_after_reduction_887 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_887: + jmp .L_last_blocks_done_865 +.L_last_num_blocks_is_12_865: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $244,%r15d + jae .L_16_blocks_overflow_888 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + jmp .L_16_blocks_ok_888 + +.L_16_blocks_overflow_888: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb 
%zmm29,%zmm4,%zmm4 +.L_16_blocks_ok_888: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vextracti32x4 $3,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm20,%zmm20{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + 
vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vextracti32x4 $3,%zmm20,%xmm7 + subq $16 * (12 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_889 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 160(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_889 +.L_small_initial_partial_block_889: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_889: + + orq %r8,%r8 + je .L_after_reduction_889 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_889: + jmp .L_last_blocks_done_865 +.L_last_num_blocks_is_13_865: + leaq 
byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $243,%r15d + jae .L_16_blocks_overflow_890 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %xmm27,%xmm4,%xmm5 + jmp .L_16_blocks_ok_890 + +.L_16_blocks_overflow_890: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %xmm29,%xmm5,%xmm5 +.L_16_blocks_ok_890: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%xmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + 
vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %xmm30,%xmm5,%xmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %xmm21,%xmm5,%xmm5 + vextracti32x4 $0,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %xmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm21,%zmm21{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vpshufb %xmm29,%xmm21,%xmm21 + vextracti32x4 $0,%zmm21,%xmm7 + subq $16 * (13 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_891 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 144(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_891 +.L_small_initial_partial_block_891: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 160(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq 
$0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_891: + + orq %r8,%r8 + je .L_after_reduction_891 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_891: + jmp .L_last_blocks_done_865 +.L_last_num_blocks_is_14_865: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $242,%r15d + jae .L_16_blocks_overflow_892 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %ymm27,%ymm4,%ymm5 + jmp .L_16_blocks_ok_892 + +.L_16_blocks_overflow_892: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %ymm29,%ymm5,%ymm5 +.L_16_blocks_ok_892: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq 
$0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%ymm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %ymm30,%ymm5,%ymm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %ymm21,%ymm5,%ymm5 + vextracti32x4 $1,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %ymm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm21,%zmm21{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vpshufb %ymm29,%ymm21,%ymm21 + vextracti32x4 $1,%zmm21,%xmm7 + subq $16 * (14 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_893 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 128(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5 + vpclmulqdq 
$0x11,%ymm1,%ymm21,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_893 +.L_small_initial_partial_block_893: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 144(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_893: + + orq %r8,%r8 + je .L_after_reduction_893 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_893: + jmp .L_last_blocks_done_865 +.L_last_num_blocks_is_15_865: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $241,%r15d + jae .L_16_blocks_overflow_894 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_894 + +.L_16_blocks_overflow_894: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 
+.L_16_blocks_ok_894: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc 
%zmm31,%zmm5,%zmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + vextracti32x4 $2,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm21,%zmm21{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vpshufb %zmm29,%zmm21,%zmm21 + vextracti32x4 $2,%zmm21,%xmm7 + subq $16 * (15 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_895 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 112(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_895 +.L_small_initial_partial_block_895: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 128(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_895: + + orq %r8,%r8 + je .L_after_reduction_895 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_895: + jmp .L_last_blocks_done_865 +.L_last_num_blocks_is_16_865: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $240,%r15d + jae .L_16_blocks_overflow_896 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_896 + +.L_16_blocks_overflow_896: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_896: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 
192(%rcx,%r11,1),%zmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + vextracti32x4 $3,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm21,%zmm21{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vpshufb %zmm29,%zmm21,%zmm21 + vextracti32x4 $3,%zmm21,%xmm7 + subq $16 * (16 - 1),%r8 +.L_small_initial_partial_block_897: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 112(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq 
%zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_897: + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_897: + jmp .L_last_blocks_done_865 +.L_last_num_blocks_is_0_865: + vmovdqa64 768(%rsp),%zmm13 + vpxorq %zmm14,%zmm13,%zmm13 + vmovdqu64 0(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 832(%rsp),%zmm13 + vmovdqu64 64(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + vpxorq %zmm10,%zmm4,%zmm26 + vpxorq %zmm6,%zmm0,%zmm24 + vpxorq %zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + vmovdqa64 896(%rsp),%zmm13 + vmovdqu64 128(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 960(%rsp),%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + + vpternlogq $0x96,%zmm10,%zmm4,%zmm26 + vpternlogq $0x96,%zmm6,%zmm0,%zmm24 + vpternlogq $0x96,%zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + + vpsrldq $8,%zmm26,%zmm0 + vpslldq $8,%zmm26,%zmm3 + vpxorq %zmm0,%zmm24,%zmm24 + vpxorq %zmm3,%zmm25,%zmm25 + vextracti64x4 $1,%zmm24,%ymm0 + vpxorq %ymm0,%ymm24,%ymm24 + vextracti32x4 $1,%ymm24,%xmm0 + vpxorq %xmm0,%xmm24,%xmm24 + vextracti64x4 $1,%zmm25,%ymm3 + vpxorq %ymm3,%ymm25,%ymm25 + vextracti32x4 $1,%ymm25,%xmm3 + vpxorq %xmm3,%xmm25,%xmm25 + vmovdqa64 POLY2(%rip),%xmm4 + + + vpclmulqdq $0x01,%xmm25,%xmm4,%xmm0 + vpslldq $8,%xmm0,%xmm0 + vpxorq %xmm0,%xmm25,%xmm0 + + + vpclmulqdq $0x00,%xmm0,%xmm4,%xmm3 + vpsrldq $4,%xmm3,%xmm3 + vpclmulqdq $0x10,%xmm0,%xmm4,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm24,%xmm3,%xmm14 + +.L_last_blocks_done_865: + vpshufb %xmm29,%xmm2,%xmm2 + jmp .L_ghash_done_821 +.L_encrypt_16_blocks_821: + cmpb $240,%r15b + jae .L_16_blocks_overflow_898 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_898 +.L_16_blocks_overflow_898: + vpshufb %zmm29,%zmm2,%zmm2 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_898: + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp),%zmm1 + + + + + vshufi64x2 $255,%zmm5,%zmm5,%zmm2 + addb $16,%r15b + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + + + + + + + + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq 
%zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm6 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + + + + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%zmm21 + + + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm15,%zmm10,%zmm26 + vpxorq %zmm12,%zmm6,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 208(%rdi),%zmm31 + + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 224(%rdi),%zmm30 + + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + + + + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + + + + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 
%zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1) + vpshufb %zmm29,%zmm17,%zmm0 + vpshufb %zmm29,%zmm19,%zmm3 + vpshufb %zmm29,%zmm20,%zmm4 + vpshufb %zmm29,%zmm21,%zmm5 + vmovdqa64 %zmm0,1280(%rsp) + vmovdqa64 %zmm3,1344(%rsp) + vmovdqa64 %zmm4,1408(%rsp) + vmovdqa64 %zmm5,1472(%rsp) + vmovdqa64 1024(%rsp),%zmm13 + vmovdqu64 256(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 1088(%rsp),%zmm13 + vmovdqu64 320(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + vpternlogq $0x96,%zmm10,%zmm4,%zmm26 + vpternlogq $0x96,%zmm6,%zmm0,%zmm24 + vpternlogq $0x96,%zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + vmovdqa64 1152(%rsp),%zmm13 + vmovdqu64 384(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 1216(%rsp),%zmm13 + vmovdqu64 448(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + + vpternlogq $0x96,%zmm10,%zmm4,%zmm26 + vpternlogq $0x96,%zmm6,%zmm0,%zmm24 + vpternlogq $0x96,%zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + subq $256,%r8 + addq $256,%r11 + movl %r8d,%r10d + addl $15,%r10d + shrl $4,%r10d + je .L_last_num_blocks_is_0_899 + + cmpl $8,%r10d + je .L_last_num_blocks_is_8_899 + jb .L_last_num_blocks_is_7_1_899 + + + cmpl $12,%r10d + je .L_last_num_blocks_is_12_899 + jb .L_last_num_blocks_is_11_9_899 + + + cmpl $15,%r10d + je .L_last_num_blocks_is_15_899 + ja .L_last_num_blocks_is_16_899 + cmpl $14,%r10d + je .L_last_num_blocks_is_14_899 + jmp .L_last_num_blocks_is_13_899 + +.L_last_num_blocks_is_11_9_899: + + cmpl $10,%r10d + je .L_last_num_blocks_is_10_899 + ja .L_last_num_blocks_is_11_899 + jmp .L_last_num_blocks_is_9_899 + +.L_last_num_blocks_is_7_1_899: + cmpl $4,%r10d + je .L_last_num_blocks_is_4_899 + jb .L_last_num_blocks_is_3_1_899 + + cmpl $6,%r10d + ja .L_last_num_blocks_is_7_899 + je .L_last_num_blocks_is_6_899 + jmp .L_last_num_blocks_is_5_899 + +.L_last_num_blocks_is_3_1_899: + + cmpl $2,%r10d + ja .L_last_num_blocks_is_3_899 + je .L_last_num_blocks_is_2_899 +.L_last_num_blocks_is_1_899: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $255,%r15d + jae .L_16_blocks_overflow_900 + vpaddd %xmm28,%xmm2,%xmm0 + jmp .L_16_blocks_ok_900 + +.L_16_blocks_overflow_900: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %xmm29,%xmm0,%xmm0 +.L_16_blocks_ok_900: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $0,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq 
$0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%xmm17{%k1}{z} + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %xmm31,%xmm0,%xmm0 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vbroadcastf64x2 176(%rdi),%zmm31 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %xmm31,%xmm0,%xmm0 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %xmm30,%xmm0,%xmm0 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti32x4 $0,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %xmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm17,%zmm17{%k1}{z} + vpshufb %xmm29,%xmm17,%xmm17 + vextracti32x4 $0,%zmm17,%xmm7 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_901 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_901 +.L_small_initial_partial_block_901: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 
%xmm11,16(%rsi) + + + + + + + + + + + + vpxorq %xmm7,%xmm14,%xmm14 + + jmp .L_after_reduction_901 +.L_small_initial_compute_done_901: +.L_after_reduction_901: + jmp .L_last_blocks_done_899 +.L_last_num_blocks_is_2_899: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $254,%r15d + jae .L_16_blocks_overflow_902 + vpaddd %ymm28,%ymm2,%ymm0 + jmp .L_16_blocks_ok_902 + +.L_16_blocks_overflow_902: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %ymm29,%ymm0,%ymm0 +.L_16_blocks_ok_902: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $1,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%ymm17{%k1}{z} + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %ymm31,%ymm0,%ymm0 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vbroadcastf64x2 176(%rdi),%zmm31 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %ymm31,%ymm0,%ymm0 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %ymm30,%ymm0,%ymm0 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%zmm0,%xmm11 + movq 
%r9,%r10 + vmovdqu8 %ymm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm17,%zmm17{%k1}{z} + vpshufb %ymm29,%ymm17,%ymm17 + vextracti32x4 $1,%zmm17,%xmm7 + subq $16 * (2 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_903 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_903 +.L_small_initial_partial_block_903: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_903: + + orq %r8,%r8 + je .L_after_reduction_903 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_903: + jmp .L_last_blocks_done_899 +.L_last_num_blocks_is_3_899: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $253,%r15d + jae .L_16_blocks_overflow_904 + vpaddd %zmm28,%zmm2,%zmm0 + jmp .L_16_blocks_ok_904 + +.L_16_blocks_overflow_904: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %zmm29,%zmm0,%zmm0 +.L_16_blocks_ok_904: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $2,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + 
vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vbroadcastf64x2 176(%rdi),%zmm31 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vpxorq %zmm17,%zmm0,%zmm0 + vextracti32x4 $2,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm17,%zmm17{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vextracti32x4 $2,%zmm17,%xmm7 + subq $16 * (3 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_905 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_905 +.L_small_initial_partial_block_905: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq 
$0x01,%ymm1,%ymm17,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_905: + + orq %r8,%r8 + je .L_after_reduction_905 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_905: + jmp .L_last_blocks_done_899 +.L_last_num_blocks_is_4_899: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $252,%r15d + jae .L_16_blocks_overflow_906 + vpaddd %zmm28,%zmm2,%zmm0 + jmp .L_16_blocks_ok_906 + +.L_16_blocks_overflow_906: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %zmm29,%zmm0,%zmm0 +.L_16_blocks_ok_906: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $3,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 
$1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vbroadcastf64x2 176(%rdi),%zmm31 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vpxorq %zmm17,%zmm0,%zmm0 + vextracti32x4 $3,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm17,%zmm17{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vextracti32x4 $3,%zmm17,%xmm7 + subq $16 * (4 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_907 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_907 +.L_small_initial_partial_block_907: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_907: + + orq %r8,%r8 + je .L_after_reduction_907 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_907: + jmp .L_last_blocks_done_899 +.L_last_num_blocks_is_5_899: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $251,%r15d + jae .L_16_blocks_overflow_908 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %xmm27,%xmm0,%xmm3 + jmp .L_16_blocks_ok_908 + +.L_16_blocks_overflow_908: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 
ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %xmm29,%xmm3,%xmm3 +.L_16_blocks_ok_908: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $0,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%xmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vbroadcastf64x2 176(%rdi),%zmm31 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %xmm30,%xmm3,%xmm3 + vpxorq 
%zmm17,%zmm0,%zmm0 + vpxorq %xmm19,%xmm3,%xmm3 + vextracti32x4 $0,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %xmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm19,%zmm19{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %xmm29,%xmm19,%xmm19 + vextracti32x4 $0,%zmm19,%xmm7 + subq $16 * (5 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_909 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_909 +.L_small_initial_partial_block_909: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_909: + + orq %r8,%r8 + je .L_after_reduction_909 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_909: + jmp .L_last_blocks_done_899 +.L_last_num_blocks_is_6_899: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $250,%r15d + jae .L_16_blocks_overflow_910 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %ymm27,%ymm0,%ymm3 + jmp .L_16_blocks_ok_910 + +.L_16_blocks_overflow_910: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %ymm29,%ymm3,%ymm3 +.L_16_blocks_ok_910: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $1,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + 
vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%ymm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vbroadcastf64x2 176(%rdi),%zmm31 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %ymm30,%ymm3,%ymm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %ymm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm19,%zmm19{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %ymm29,%ymm19,%ymm19 + vextracti32x4 $1,%zmm19,%xmm7 + subq $16 * (6 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_911 + + + + + + subq $16,%r8 + movq 
$0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_911 +.L_small_initial_partial_block_911: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_911: + + orq %r8,%r8 + je .L_after_reduction_911 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_911: + jmp .L_last_blocks_done_899 +.L_last_num_blocks_is_7_899: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $249,%r15d + jae .L_16_blocks_overflow_912 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + jmp .L_16_blocks_ok_912 + +.L_16_blocks_overflow_912: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 +.L_16_blocks_ok_912: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $2,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq 
$0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vbroadcastf64x2 176(%rdi),%zmm31 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti32x4 $2,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm19,%zmm19{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vextracti32x4 $2,%zmm19,%xmm7 + subq $16 * (7 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_913 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq 
$0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_913 +.L_small_initial_partial_block_913: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_913: + + orq %r8,%r8 + je .L_after_reduction_913 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_913: + jmp .L_last_blocks_done_899 +.L_last_num_blocks_is_8_899: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $248,%r15d + jae .L_16_blocks_overflow_914 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + jmp .L_16_blocks_ok_914 + +.L_16_blocks_overflow_914: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 +.L_16_blocks_ok_914: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $3,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq 
$0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vbroadcastf64x2 176(%rdi),%zmm31 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti32x4 $3,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm19,%zmm19{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vextracti32x4 $3,%zmm19,%xmm7 + subq $16 * (8 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_915 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq 
$0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_915 +.L_small_initial_partial_block_915: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_915: + + orq %r8,%r8 + je .L_after_reduction_915 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_915: + jmp .L_last_blocks_done_899 +.L_last_num_blocks_is_9_899: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $247,%r15d + jae .L_16_blocks_overflow_916 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %xmm27,%xmm3,%xmm4 + jmp .L_16_blocks_ok_916 + +.L_16_blocks_overflow_916: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %xmm29,%xmm4,%xmm4 +.L_16_blocks_ok_916: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $0,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 
32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%xmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vbroadcastf64x2 176(%rdi),%zmm31 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %xmm30,%xmm4,%xmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq 
%zmm19,%zmm3,%zmm3 + vpxorq %xmm20,%xmm4,%xmm4 + vextracti32x4 $0,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %xmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm20,%zmm20{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %xmm29,%xmm20,%xmm20 + vextracti32x4 $0,%zmm20,%xmm7 + subq $16 * (9 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_917 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_917 +.L_small_initial_partial_block_917: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_917: + + orq %r8,%r8 + je .L_after_reduction_917 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_917: + jmp .L_last_blocks_done_899 +.L_last_num_blocks_is_10_899: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq 
(%r10,%rax,8),%k1 + cmpl $246,%r15d + jae .L_16_blocks_overflow_918 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %ymm27,%ymm3,%ymm4 + jmp .L_16_blocks_ok_918 + +.L_16_blocks_overflow_918: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %ymm29,%ymm4,%ymm4 +.L_16_blocks_ok_918: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $1,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%ymm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vbroadcastf64x2 176(%rdi),%zmm31 + vpclmulqdq 
$0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %ymm30,%ymm4,%ymm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %ymm20,%ymm4,%ymm4 + vextracti32x4 $1,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %ymm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm20,%zmm20{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %ymm29,%ymm20,%ymm20 + vextracti32x4 $1,%zmm20,%xmm7 + subq $16 * (10 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_919 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_919 +.L_small_initial_partial_block_919: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5 + 
vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_919: + + orq %r8,%r8 + je .L_after_reduction_919 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_919: + jmp .L_last_blocks_done_899 +.L_last_num_blocks_is_11_899: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $245,%r15d + jae .L_16_blocks_overflow_920 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + jmp .L_16_blocks_ok_920 + +.L_16_blocks_overflow_920: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 +.L_16_blocks_ok_920: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $2,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + 
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vbroadcastf64x2 176(%rdi),%zmm31 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vextracti32x4 $2,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm20,%zmm20{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vextracti32x4 $2,%zmm20,%xmm7 + subq $16 * (11 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_921 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq 
%xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_921 +.L_small_initial_partial_block_921: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_921: + + orq %r8,%r8 + je .L_after_reduction_921 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_921: + jmp .L_last_blocks_done_899 +.L_last_num_blocks_is_12_899: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $244,%r15d + jae .L_16_blocks_overflow_922 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + jmp .L_16_blocks_ok_922 + +.L_16_blocks_overflow_922: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 +.L_16_blocks_ok_922: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $3,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vbroadcastf64x2 176(%rdi),%zmm31 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vextracti32x4 $3,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm20,%zmm20{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vextracti32x4 $3,%zmm20,%xmm7 + subq $16 * (12 - 1),%r8 + + + cmpq 
$16,%r8 + jl .L_small_initial_partial_block_923 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 160(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_923 +.L_small_initial_partial_block_923: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_923: + + orq %r8,%r8 + je .L_after_reduction_923 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_923: + jmp .L_last_blocks_done_899 +.L_last_num_blocks_is_13_899: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $243,%r15d + jae .L_16_blocks_overflow_924 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + 
vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %xmm27,%xmm4,%xmm5 + jmp .L_16_blocks_ok_924 + +.L_16_blocks_overflow_924: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %xmm29,%xmm5,%xmm5 +.L_16_blocks_ok_924: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $0,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%xmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vextracti64x4 
$1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vbroadcastf64x2 176(%rdi),%zmm31 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %xmm30,%xmm5,%xmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %xmm21,%xmm5,%xmm5 + vextracti32x4 $0,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %xmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm21,%zmm21{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vpshufb %xmm29,%xmm21,%xmm21 + vextracti32x4 $0,%zmm21,%xmm7 + subq $16 * (13 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_925 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 144(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp 
.L_small_initial_compute_done_925 +.L_small_initial_partial_block_925: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 160(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_925: + + orq %r8,%r8 + je .L_after_reduction_925 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_925: + jmp .L_last_blocks_done_899 +.L_last_num_blocks_is_14_899: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $242,%r15d + jae .L_16_blocks_overflow_926 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %ymm27,%ymm4,%ymm5 + jmp .L_16_blocks_ok_926 + +.L_16_blocks_overflow_926: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %ymm29,%ymm5,%ymm5 +.L_16_blocks_ok_926: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $1,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq 
$0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%ymm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vbroadcastf64x2 176(%rdi),%zmm31 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %ymm30,%ymm5,%ymm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %ymm21,%ymm5,%ymm5 + vextracti32x4 $1,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %ymm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm21,%zmm21{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + 
vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vpshufb %ymm29,%ymm21,%ymm21 + vextracti32x4 $1,%zmm21,%xmm7 + subq $16 * (14 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_927 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 128(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_927 +.L_small_initial_partial_block_927: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 144(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq 
$0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_927: + + orq %r8,%r8 + je .L_after_reduction_927 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_927: + jmp .L_last_blocks_done_899 +.L_last_num_blocks_is_15_899: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $241,%r15d + jae .L_16_blocks_overflow_928 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_928 + +.L_16_blocks_overflow_928: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_928: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $2,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq 
$0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vbroadcastf64x2 176(%rdi),%zmm31 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + vextracti32x4 $2,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm21,%zmm21{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vpshufb %zmm29,%zmm21,%zmm21 + vextracti32x4 $2,%zmm21,%xmm7 + subq $16 * (15 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_929 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 112(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq 
%zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_929 +.L_small_initial_partial_block_929: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 128(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_929: + + orq %r8,%r8 + je .L_after_reduction_929 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_929: + jmp .L_last_blocks_done_899 +.L_last_num_blocks_is_16_899: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $240,%r15d + jae .L_16_blocks_overflow_930 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_930 + +.L_16_blocks_overflow_930: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_930: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vmovdqa64 1280(%rsp),%zmm8 + vmovdqu64 512(%rsp),%zmm1 + vextracti32x4 $3,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 
16(%rdi),%zmm31 + vmovdqu64 576(%rsp),%zmm18 + vmovdqa64 1344(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 640(%rsp),%zmm1 + vmovdqa64 1408(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 704(%rsp),%zmm18 + vmovdqa64 1472(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpternlogq $0x96,%zmm12,%zmm24,%zmm14 + vpternlogq $0x96,%zmm13,%zmm25,%zmm7 + vpternlogq $0x96,%zmm15,%zmm26,%zmm10 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vpsrldq $8,%zmm10,%zmm15 + vpslldq $8,%zmm10,%zmm10 + + vmovdqa64 POLY2(%rip),%xmm16 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vpxorq %zmm15,%zmm14,%zmm14 + vpxorq %zmm10,%zmm7,%zmm7 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vextracti64x4 $1,%zmm14,%ymm12 + vpxorq %ymm12,%ymm14,%ymm14 + vextracti32x4 $1,%ymm14,%xmm12 + vpxorq %xmm12,%xmm14,%xmm14 + vextracti64x4 $1,%zmm7,%ymm13 + vpxorq %ymm13,%ymm7,%ymm7 + vextracti32x4 $1,%ymm7,%xmm13 + vpxorq %xmm13,%xmm7,%xmm7 + vbroadcastf64x2 176(%rdi),%zmm31 + vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13 + vpslldq $8,%xmm13,%xmm13 + vpxorq %xmm13,%xmm7,%xmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + 
vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12 + vpsrldq $4,%xmm12,%xmm12 + vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15 + vpslldq $4,%xmm15,%xmm15 + + vpternlogq $0x96,%xmm12,%xmm15,%xmm14 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + vextracti32x4 $3,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm21,%zmm21{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vpshufb %zmm29,%zmm21,%zmm21 + vextracti32x4 $3,%zmm21,%xmm7 + subq $16 * (16 - 1),%r8 +.L_small_initial_partial_block_931: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vpxorq %zmm14,%zmm17,%zmm17 + vmovdqu64 112(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm31,%zmm5,%zmm5 + vpxorq %zmm8,%zmm0,%zmm0 + vpxorq %zmm22,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_931: + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_931: + jmp .L_last_blocks_done_899 +.L_last_num_blocks_is_0_899: + vmovdqa64 1280(%rsp),%zmm13 + vmovdqu64 512(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 1344(%rsp),%zmm13 + vmovdqu64 576(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + vpternlogq 
$0x96,%zmm10,%zmm4,%zmm26 + vpternlogq $0x96,%zmm6,%zmm0,%zmm24 + vpternlogq $0x96,%zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + vmovdqa64 1408(%rsp),%zmm13 + vmovdqu64 640(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 1472(%rsp),%zmm13 + vmovdqu64 704(%rsp),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + + vpternlogq $0x96,%zmm10,%zmm4,%zmm26 + vpternlogq $0x96,%zmm6,%zmm0,%zmm24 + vpternlogq $0x96,%zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + + vpsrldq $8,%zmm26,%zmm0 + vpslldq $8,%zmm26,%zmm3 + vpxorq %zmm0,%zmm24,%zmm24 + vpxorq %zmm3,%zmm25,%zmm25 + vextracti64x4 $1,%zmm24,%ymm0 + vpxorq %ymm0,%ymm24,%ymm24 + vextracti32x4 $1,%ymm24,%xmm0 + vpxorq %xmm0,%xmm24,%xmm24 + vextracti64x4 $1,%zmm25,%ymm3 + vpxorq %ymm3,%ymm25,%ymm25 + vextracti32x4 $1,%ymm25,%xmm3 + vpxorq %xmm3,%xmm25,%xmm25 + vmovdqa64 POLY2(%rip),%xmm4 + + + vpclmulqdq $0x01,%xmm25,%xmm4,%xmm0 + vpslldq $8,%xmm0,%xmm0 + vpxorq %xmm0,%xmm25,%xmm0 + + + vpclmulqdq $0x00,%xmm0,%xmm4,%xmm3 + vpsrldq $4,%xmm3,%xmm3 + vpclmulqdq $0x10,%xmm0,%xmm4,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm24,%xmm3,%xmm14 + +.L_last_blocks_done_899: + vpshufb %xmm29,%xmm2,%xmm2 + jmp .L_ghash_done_821 + +.L_message_below_32_blocks_821: + + + subq $256,%r8 + addq $256,%r11 + movl %r8d,%r10d + testq %r14,%r14 + jnz .L_skip_hkeys_precomputation_932 + vmovdqu64 640(%rsp),%zmm3 + + + vshufi64x2 $0x00,%zmm3,%zmm3,%zmm3 + + vmovdqu64 576(%rsp),%zmm4 + vmovdqu64 512(%rsp),%zmm5 + + vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4 + vpxorq %zmm10,%zmm4,%zmm4 + + vpsrldq $8,%zmm4,%zmm10 + vpslldq $8,%zmm4,%zmm4 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4 + vpslldq $4,%zmm4,%zmm4 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm4 + + vmovdqu64 %zmm4,448(%rsp) + + vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5 + vpxorq %zmm10,%zmm5,%zmm5 + + vpsrldq $8,%zmm5,%zmm10 + vpslldq $8,%zmm5,%zmm5 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5 + vpslldq $4,%zmm5,%zmm5 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm5 + + vmovdqu64 %zmm5,384(%rsp) + + vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4 + vpxorq %zmm10,%zmm4,%zmm4 + + vpsrldq $8,%zmm4,%zmm10 + vpslldq $8,%zmm4,%zmm4 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm4,%zmm4 + + + + vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4 + vpslldq $4,%zmm4,%zmm4 + + vpternlogq 
$0x96,%zmm7,%zmm6,%zmm4 + + vmovdqu64 %zmm4,320(%rsp) + + vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6 + vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7 + vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10 + vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5 + vpxorq %zmm10,%zmm5,%zmm5 + + vpsrldq $8,%zmm5,%zmm10 + vpslldq $8,%zmm5,%zmm5 + vpxorq %zmm10,%zmm6,%zmm6 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vmovdqu64 POLY2(%rip),%zmm10 + + vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7 + vpslldq $8,%zmm7,%zmm7 + vpxorq %zmm7,%zmm5,%zmm5 + + + + vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7 + vpsrldq $4,%zmm7,%zmm7 + vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5 + vpslldq $4,%zmm5,%zmm5 + + vpternlogq $0x96,%zmm7,%zmm6,%zmm5 + + vmovdqu64 %zmm5,256(%rsp) +.L_skip_hkeys_precomputation_932: + movq $1,%r14 + andl $~15,%r10d + movl $512,%ebx + subl %r10d,%ebx + movl %r8d,%r10d + addl $15,%r10d + shrl $4,%r10d + je .L_last_num_blocks_is_0_933 + + cmpl $8,%r10d + je .L_last_num_blocks_is_8_933 + jb .L_last_num_blocks_is_7_1_933 + + + cmpl $12,%r10d + je .L_last_num_blocks_is_12_933 + jb .L_last_num_blocks_is_11_9_933 + + + cmpl $15,%r10d + je .L_last_num_blocks_is_15_933 + ja .L_last_num_blocks_is_16_933 + cmpl $14,%r10d + je .L_last_num_blocks_is_14_933 + jmp .L_last_num_blocks_is_13_933 + +.L_last_num_blocks_is_11_9_933: + + cmpl $10,%r10d + je .L_last_num_blocks_is_10_933 + ja .L_last_num_blocks_is_11_933 + jmp .L_last_num_blocks_is_9_933 + +.L_last_num_blocks_is_7_1_933: + cmpl $4,%r10d + je .L_last_num_blocks_is_4_933 + jb .L_last_num_blocks_is_3_1_933 + + cmpl $6,%r10d + ja .L_last_num_blocks_is_7_933 + je .L_last_num_blocks_is_6_933 + jmp .L_last_num_blocks_is_5_933 + +.L_last_num_blocks_is_3_1_933: + + cmpl $2,%r10d + ja .L_last_num_blocks_is_3_933 + je .L_last_num_blocks_is_2_933 +.L_last_num_blocks_is_1_933: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $255,%r15d + jae .L_16_blocks_overflow_934 + vpaddd %xmm28,%xmm2,%xmm0 + jmp .L_16_blocks_ok_934 + +.L_16_blocks_overflow_934: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %xmm29,%xmm0,%xmm0 +.L_16_blocks_ok_934: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%xmm17{%k1}{z} + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + 
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %xmm31,%xmm0,%xmm0 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %xmm30,%xmm0,%xmm0 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %xmm31,%xmm0,%xmm0 + vaesenclast %xmm30,%xmm0,%xmm0 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti32x4 $0,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %xmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm17,%zmm17{%k1}{z} + vpshufb %xmm29,%xmm17,%xmm17 + vextracti32x4 $0,%zmm17,%xmm7 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_935 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_935 +.L_small_initial_partial_block_935: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + + + vpsrldq $8,%zmm26,%zmm0 + vpslldq $8,%zmm26,%zmm3 + vpxorq %zmm0,%zmm24,%zmm24 + vpxorq %zmm3,%zmm25,%zmm25 + vextracti64x4 $1,%zmm24,%ymm0 + vpxorq %ymm0,%ymm24,%ymm24 + vextracti32x4 $1,%ymm24,%xmm0 + vpxorq %xmm0,%xmm24,%xmm24 + vextracti64x4 $1,%zmm25,%ymm3 + vpxorq %ymm3,%ymm25,%ymm25 + vextracti32x4 $1,%ymm25,%xmm3 + vpxorq %xmm3,%xmm25,%xmm25 + vmovdqa64 POLY2(%rip),%xmm0 + + + vpclmulqdq $0x01,%xmm25,%xmm0,%xmm3 + vpslldq $8,%xmm3,%xmm3 + vpxorq %xmm3,%xmm25,%xmm3 + + + vpclmulqdq $0x00,%xmm3,%xmm0,%xmm4 + vpsrldq $4,%xmm4,%xmm4 + vpclmulqdq $0x10,%xmm3,%xmm0,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm24,%xmm4,%xmm14 + + + + + + + + + + + + + vpxorq %xmm7,%xmm14,%xmm14 + + jmp .L_after_reduction_935 +.L_small_initial_compute_done_935: +.L_after_reduction_935: + jmp .L_last_blocks_done_933 +.L_last_num_blocks_is_2_933: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $254,%r15d + jae .L_16_blocks_overflow_936 + vpaddd %ymm28,%ymm2,%ymm0 + jmp .L_16_blocks_ok_936 + +.L_16_blocks_overflow_936: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %ymm29,%ymm0,%ymm0 +.L_16_blocks_ok_936: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + 
vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%ymm17{%k1}{z} + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %ymm31,%ymm0,%ymm0 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %ymm30,%ymm0,%ymm0 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %ymm31,%ymm0,%ymm0 + vaesenclast %ymm30,%ymm0,%ymm0 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %ymm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm17,%zmm17{%k1}{z} + vpshufb %ymm29,%ymm17,%ymm17 + vextracti32x4 $1,%zmm17,%xmm7 + subq $16 * (2 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_937 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_937 +.L_small_initial_partial_block_937: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq 
$0x01,%xmm1,%xmm17,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_937: + + orq %r8,%r8 + je .L_after_reduction_937 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_937: + jmp .L_last_blocks_done_933 +.L_last_num_blocks_is_3_933: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $253,%r15d + jae .L_16_blocks_overflow_938 + vpaddd %zmm28,%zmm2,%zmm0 + jmp .L_16_blocks_ok_938 + +.L_16_blocks_overflow_938: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %zmm29,%zmm0,%zmm0 +.L_16_blocks_ok_938: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 
208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm0,%zmm0 + vpxorq %zmm17,%zmm0,%zmm0 + vextracti32x4 $2,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm17,%zmm17{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vextracti32x4 $2,%zmm17,%xmm7 + subq $16 * (3 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_939 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_939 +.L_small_initial_partial_block_939: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_939: + + orq %r8,%r8 + je .L_after_reduction_939 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_939: + jmp .L_last_blocks_done_933 +.L_last_num_blocks_is_4_933: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $252,%r15d + jae .L_16_blocks_overflow_940 + vpaddd %zmm28,%zmm2,%zmm0 + jmp .L_16_blocks_ok_940 + +.L_16_blocks_overflow_940: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpshufb %zmm29,%zmm0,%zmm0 +.L_16_blocks_ok_940: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm0,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + 
vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm0,%zmm0 + vpxorq %zmm17,%zmm0,%zmm0 + vextracti32x4 $3,%zmm0,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm17,%zmm17{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vextracti32x4 $3,%zmm17,%xmm7 + subq $16 * (4 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_941 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_941 +.L_small_initial_partial_block_941: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpxorq %zmm26,%zmm4,%zmm4 + vpxorq %zmm24,%zmm0,%zmm0 + vpxorq %zmm25,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + 
vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_941: + + orq %r8,%r8 + je .L_after_reduction_941 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_941: + jmp .L_last_blocks_done_933 +.L_last_num_blocks_is_5_933: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $251,%r15d + jae .L_16_blocks_overflow_942 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %xmm27,%xmm0,%xmm3 + jmp .L_16_blocks_ok_942 + +.L_16_blocks_overflow_942: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %xmm29,%xmm3,%xmm3 +.L_16_blocks_ok_942: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%xmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + 
vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %xmm30,%xmm3,%xmm3 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %xmm31,%xmm3,%xmm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %xmm30,%xmm3,%xmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %xmm19,%xmm3,%xmm3 + vextracti32x4 $0,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %xmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm19,%zmm19{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %xmm29,%xmm19,%xmm19 + vextracti32x4 $0,%zmm19,%xmm7 + subq $16 * (5 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_943 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_943 +.L_small_initial_partial_block_943: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_943: + + orq %r8,%r8 + je .L_after_reduction_943 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_943: + jmp .L_last_blocks_done_933 +.L_last_num_blocks_is_6_933: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $250,%r15d + jae .L_16_blocks_overflow_944 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd 
%ymm27,%ymm0,%ymm3 + jmp .L_16_blocks_ok_944 + +.L_16_blocks_overflow_944: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %ymm29,%ymm3,%ymm3 +.L_16_blocks_ok_944: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%ymm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %ymm30,%ymm3,%ymm3 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %ymm31,%ymm3,%ymm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %ymm30,%ymm3,%ymm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %ymm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm19,%zmm19{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %ymm29,%ymm19,%ymm19 + vextracti32x4 $1,%zmm19,%xmm7 + subq $16 * (6 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_945 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + 
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_945 +.L_small_initial_partial_block_945: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_945: + + orq %r8,%r8 + je .L_after_reduction_945 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_945: + jmp .L_last_blocks_done_933 +.L_last_num_blocks_is_7_933: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $249,%r15d + jae .L_16_blocks_overflow_946 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + jmp .L_16_blocks_ok_946 + +.L_16_blocks_overflow_946: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 +.L_16_blocks_ok_946: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq 
$0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti32x4 $2,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm19,%zmm19{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vextracti32x4 $2,%zmm19,%xmm7 + subq $16 * (7 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_947 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 
$1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_947 +.L_small_initial_partial_block_947: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_947: + + orq %r8,%r8 + je .L_after_reduction_947 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_947: + jmp .L_last_blocks_done_933 +.L_last_num_blocks_is_8_933: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $64,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $248,%r15d + jae .L_16_blocks_overflow_948 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + jmp .L_16_blocks_ok_948 + +.L_16_blocks_overflow_948: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 +.L_16_blocks_ok_948: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm3,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 
80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti32x4 $3,%zmm3,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm19,%zmm19{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vextracti32x4 $3,%zmm19,%xmm7 + subq $16 * (8 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_949 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_949 +.L_small_initial_partial_block_949: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 
$2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_949: + + orq %r8,%r8 + je .L_after_reduction_949 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_949: + jmp .L_last_blocks_done_933 +.L_last_num_blocks_is_9_933: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $247,%r15d + jae .L_16_blocks_overflow_950 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %xmm27,%xmm3,%xmm4 + jmp .L_16_blocks_ok_950 + +.L_16_blocks_overflow_950: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %xmm29,%xmm4,%xmm4 +.L_16_blocks_ok_950: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%xmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc 
%zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %xmm30,%xmm4,%xmm4 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %xmm31,%xmm4,%xmm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %xmm30,%xmm4,%xmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %xmm20,%xmm4,%xmm4 + vextracti32x4 $0,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %xmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm20,%zmm20{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %xmm29,%xmm20,%xmm20 + vextracti32x4 $0,%zmm20,%xmm7 + subq $16 * (9 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_951 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_951 +.L_small_initial_partial_block_951: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + 
vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_951: + + orq %r8,%r8 + je .L_after_reduction_951 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_951: + jmp .L_last_blocks_done_933 +.L_last_num_blocks_is_10_933: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $246,%r15d + jae .L_16_blocks_overflow_952 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %ymm27,%ymm3,%ymm4 + jmp .L_16_blocks_ok_952 + +.L_16_blocks_overflow_952: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %ymm29,%ymm4,%ymm4 +.L_16_blocks_ok_952: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc 
%zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%ymm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %ymm30,%ymm4,%ymm4 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %ymm31,%ymm4,%ymm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %ymm30,%ymm4,%ymm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %ymm20,%ymm4,%ymm4 + vextracti32x4 $1,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %ymm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm20,%zmm20{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %ymm29,%ymm20,%ymm20 + vextracti32x4 $1,%zmm20,%xmm7 + subq $16 * (10 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_953 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq 
$0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_953 +.L_small_initial_partial_block_953: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_953: + + orq %r8,%r8 + je .L_after_reduction_953 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_953: + jmp .L_last_blocks_done_933 +.L_last_num_blocks_is_11_933: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $245,%r15d + jae .L_16_blocks_overflow_954 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + jmp .L_16_blocks_ok_954 + +.L_16_blocks_overflow_954: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 +.L_16_blocks_ok_954: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + 
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vextracti32x4 $2,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm20,%zmm20{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vextracti32x4 $2,%zmm20,%xmm7 + subq $16 * (11 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_955 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq 
$8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_955 +.L_small_initial_partial_block_955: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_955: + + orq %r8,%r8 + je .L_after_reduction_955 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_955: + jmp .L_last_blocks_done_933 +.L_last_num_blocks_is_12_933: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $128,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $244,%r15d + jae .L_16_blocks_overflow_956 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + jmp .L_16_blocks_ok_956 + +.L_16_blocks_overflow_956: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 +.L_16_blocks_ok_956: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm4,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 
128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vextracti32x4 $3,%zmm4,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm20,%zmm20{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vextracti32x4 $3,%zmm20,%xmm7 + subq $16 * (12 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_957 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 160(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 
288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq %zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_957 +.L_small_initial_partial_block_957: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vpxorq %zmm8,%zmm0,%zmm8 + vpxorq %zmm22,%zmm3,%zmm22 + vpxorq %zmm30,%zmm4,%zmm30 + vpxorq %zmm31,%zmm5,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_957: + + orq %r8,%r8 + je .L_after_reduction_957 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_957: + jmp .L_last_blocks_done_933 +.L_last_num_blocks_is_13_933: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $243,%r15d + jae .L_16_blocks_overflow_958 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %xmm27,%xmm4,%xmm5 + jmp .L_16_blocks_ok_958 + +.L_16_blocks_overflow_958: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb 
%xmm29,%xmm5,%xmm5 +.L_16_blocks_ok_958: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $0,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%xmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %xmm30,%xmm5,%xmm5 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc 
%zmm31,%zmm4,%zmm4 + vaesenc %xmm31,%xmm5,%xmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %xmm30,%xmm5,%xmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %xmm21,%xmm5,%xmm5 + vextracti32x4 $0,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %xmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm21,%zmm21{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vpshufb %xmm29,%xmm21,%xmm21 + vextracti32x4 $0,%zmm21,%xmm7 + subq $16 * (13 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_959 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 144(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_959 +.L_small_initial_partial_block_959: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 160(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 224(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 288(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + + vpxorq %zmm26,%zmm30,%zmm30 + vpxorq %zmm24,%zmm8,%zmm8 + vpxorq %zmm25,%zmm22,%zmm22 + + vpxorq %zmm31,%zmm30,%zmm30 + vpsrldq $8,%zmm30,%zmm4 + vpslldq $8,%zmm30,%zmm5 + vpxorq %zmm4,%zmm8,%zmm0 + vpxorq 
%zmm5,%zmm22,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_959: + + orq %r8,%r8 + je .L_after_reduction_959 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_959: + jmp .L_last_blocks_done_933 +.L_last_num_blocks_is_14_933: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $242,%r15d + jae .L_16_blocks_overflow_960 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %ymm27,%ymm4,%ymm5 + jmp .L_16_blocks_ok_960 + +.L_16_blocks_overflow_960: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %ymm29,%ymm5,%ymm5 +.L_16_blocks_ok_960: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $1,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%ymm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq 
$0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %ymm30,%ymm5,%ymm5 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %ymm31,%ymm5,%ymm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %ymm30,%ymm5,%ymm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %ymm21,%ymm5,%ymm5 + vextracti32x4 $1,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %ymm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm21,%zmm21{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vpshufb %ymm29,%ymm21,%ymm21 + vextracti32x4 $1,%zmm21,%xmm7 + subq $16 * (14 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_961 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 128(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq 
%xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_961 +.L_small_initial_partial_block_961: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 144(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 208(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 272(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 336(%rsi),%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5 + vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0 + vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_961: + + orq %r8,%r8 + je .L_after_reduction_961 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_961: + jmp .L_last_blocks_done_933 +.L_last_num_blocks_is_15_933: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $241,%r15d + jae .L_16_blocks_overflow_962 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_962 + +.L_16_blocks_overflow_962: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_962: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $2,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq 
$0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + vextracti32x4 $2,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm21,%zmm21{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 
+ vpshufb %zmm29,%zmm21,%zmm21 + vextracti32x4 $2,%zmm21,%xmm7 + subq $16 * (15 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_963 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vmovdqu64 112(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_963 +.L_small_initial_partial_block_963: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 128(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 192(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 256(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 320(%rsi),%ymm1 + vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4 + vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5 + vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0 + vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq 
$8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_963: + + orq %r8,%r8 + je .L_after_reduction_963 + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_963: + jmp .L_last_blocks_done_933 +.L_last_num_blocks_is_16_933: + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%rax + subq $192,%rax + kmovq (%r10,%rax,8),%k1 + cmpl $240,%r15d + jae .L_16_blocks_overflow_964 + vpaddd %zmm28,%zmm2,%zmm0 + vpaddd %zmm27,%zmm0,%zmm3 + vpaddd %zmm27,%zmm3,%zmm4 + vpaddd %zmm27,%zmm4,%zmm5 + jmp .L_16_blocks_ok_964 + +.L_16_blocks_overflow_964: + vpshufb %zmm29,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vmovdqa64 ddq_add_4444(%rip),%zmm5 + vpaddd %zmm5,%zmm0,%zmm3 + vpaddd %zmm5,%zmm3,%zmm4 + vpaddd %zmm5,%zmm4,%zmm5 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 +.L_16_blocks_ok_964: + + + + + vbroadcastf64x2 0(%rdi),%zmm30 + vpxorq 768(%rsp),%zmm14,%zmm8 + vmovdqu64 0(%rsp,%rbx,1),%zmm1 + vextracti32x4 $3,%zmm5,%xmm2 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + + + vbroadcastf64x2 16(%rdi),%zmm31 + vmovdqu64 64(%rsp,%rbx,1),%zmm18 + vmovdqa64 832(%rsp),%zmm22 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm30,%zmm3,%zmm3 + vpxorq %zmm30,%zmm4,%zmm4 + vpxorq %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm30 + + + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10 + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11 + vmovdqu64 128(%rsp,%rbx,1),%zmm1 + vmovdqa64 896(%rsp),%zmm8 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm18 + vmovdqa64 960(%rsp),%zmm22 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm30 + + + vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20 + vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21 + vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm31 + + + vpternlogq $0x96,%zmm17,%zmm12,%zmm14 + vpternlogq $0x96,%zmm19,%zmm13,%zmm7 + vpternlogq $0x96,%zmm21,%zmm16,%zmm11 + vpternlogq $0x96,%zmm20,%zmm15,%zmm10 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm30 + vmovdqu8 0(%rcx,%r11,1),%zmm17 + vmovdqu8 64(%rcx,%r11,1),%zmm19 + vmovdqu8 128(%rcx,%r11,1),%zmm20 + vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z} + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm31 + + + vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15 + vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16 + vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12 + vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm30 + vpternlogq $0x96,%zmm16,%zmm11,%zmm10 + vpxorq %zmm12,%zmm14,%zmm24 + vpxorq %zmm13,%zmm7,%zmm25 + 
vpxorq %zmm15,%zmm10,%zmm26 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 176(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 192(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vbroadcastf64x2 208(%rdi),%zmm31 + vaesenc %zmm30,%zmm0,%zmm0 + vaesenc %zmm30,%zmm3,%zmm3 + vaesenc %zmm30,%zmm4,%zmm4 + vaesenc %zmm30,%zmm5,%zmm5 + vbroadcastf64x2 224(%rdi),%zmm30 + vaesenc %zmm31,%zmm0,%zmm0 + vaesenc %zmm31,%zmm3,%zmm3 + vaesenc %zmm31,%zmm4,%zmm4 + vaesenc %zmm31,%zmm5,%zmm5 + vaesenclast %zmm30,%zmm0,%zmm0 + vaesenclast %zmm30,%zmm3,%zmm3 + vaesenclast %zmm30,%zmm4,%zmm4 + vaesenclast %zmm30,%zmm5,%zmm5 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vpxorq %zmm20,%zmm4,%zmm4 + vpxorq %zmm21,%zmm5,%zmm5 + vextracti32x4 $3,%zmm5,%xmm11 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm21,%zmm21{%k1}{z} + vpshufb %zmm29,%zmm17,%zmm17 + vpshufb %zmm29,%zmm19,%zmm19 + vpshufb %zmm29,%zmm20,%zmm20 + vpshufb %zmm29,%zmm21,%zmm21 + vextracti32x4 $3,%zmm21,%xmm7 + subq $16 * (16 - 1),%r8 +.L_small_initial_partial_block_965: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm11,16(%rsi) + vmovdqu64 112(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5 + vmovdqu64 176(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8 + vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30 + vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31 + vmovdqu64 240(%rsi),%zmm1 + vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm0,%zmm17,%zmm8 + vpternlogq $0x96,%zmm3,%zmm19,%zmm22 + vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17 + vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19 + vpternlogq $0x96,%zmm4,%zmm17,%zmm30 + vpternlogq $0x96,%zmm5,%zmm19,%zmm31 + vmovdqu64 304(%rsi),%ymm1 + vinserti64x2 $2,336(%rsi),%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4 + vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5 + vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0 + vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3 + + vpxorq %zmm30,%zmm4,%zmm4 + vpternlogq $0x96,%zmm31,%zmm26,%zmm5 + vpternlogq $0x96,%zmm8,%zmm24,%zmm0 + vpternlogq $0x96,%zmm22,%zmm25,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm30 + vpslldq $8,%zmm4,%zmm31 + vpxorq %zmm30,%zmm0,%zmm0 + vpxorq %zmm31,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm30 + vpxorq %ymm30,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm30 + vpxorq %xmm30,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm31 + vpxorq %ymm31,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm31 + vpxorq %xmm31,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm1 + + + vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + 
+.L_small_initial_compute_done_965: + vpxorq %xmm7,%xmm14,%xmm14 +.L_after_reduction_965: + jmp .L_last_blocks_done_933 +.L_last_num_blocks_is_0_933: + vmovdqa64 768(%rsp),%zmm13 + vpxorq %zmm14,%zmm13,%zmm13 + vmovdqu64 0(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 832(%rsp),%zmm13 + vmovdqu64 64(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + vpxorq %zmm10,%zmm4,%zmm26 + vpxorq %zmm6,%zmm0,%zmm24 + vpxorq %zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + vmovdqa64 896(%rsp),%zmm13 + vmovdqu64 128(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5 + vmovdqa64 960(%rsp),%zmm13 + vmovdqu64 192(%rsp,%rbx,1),%zmm12 + vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6 + vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7 + vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10 + vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11 + + vpternlogq $0x96,%zmm10,%zmm4,%zmm26 + vpternlogq $0x96,%zmm6,%zmm0,%zmm24 + vpternlogq $0x96,%zmm7,%zmm3,%zmm25 + vpternlogq $0x96,%zmm11,%zmm5,%zmm26 + + vpsrldq $8,%zmm26,%zmm0 + vpslldq $8,%zmm26,%zmm3 + vpxorq %zmm0,%zmm24,%zmm24 + vpxorq %zmm3,%zmm25,%zmm25 + vextracti64x4 $1,%zmm24,%ymm0 + vpxorq %ymm0,%ymm24,%ymm24 + vextracti32x4 $1,%ymm24,%xmm0 + vpxorq %xmm0,%xmm24,%xmm24 + vextracti64x4 $1,%zmm25,%ymm3 + vpxorq %ymm3,%ymm25,%ymm25 + vextracti32x4 $1,%ymm25,%xmm3 + vpxorq %xmm3,%xmm25,%xmm25 + vmovdqa64 POLY2(%rip),%xmm4 + + + vpclmulqdq $0x01,%xmm25,%xmm4,%xmm0 + vpslldq $8,%xmm0,%xmm0 + vpxorq %xmm0,%xmm25,%xmm0 + + + vpclmulqdq $0x00,%xmm0,%xmm4,%xmm3 + vpsrldq $4,%xmm3,%xmm3 + vpclmulqdq $0x10,%xmm0,%xmm4,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm24,%xmm3,%xmm14 + +.L_last_blocks_done_933: + vpshufb %xmm29,%xmm2,%xmm2 + jmp .L_ghash_done_821 + +.L_message_below_equal_16_blocks_821: + + + movl %r8d,%r12d + addl $15,%r12d + shrl $4,%r12d + cmpq $8,%r12 + je .L_small_initial_num_blocks_is_8_966 + jl .L_small_initial_num_blocks_is_7_1_966 + + + cmpq $12,%r12 + je .L_small_initial_num_blocks_is_12_966 + jl .L_small_initial_num_blocks_is_11_9_966 + + + cmpq $16,%r12 + je .L_small_initial_num_blocks_is_16_966 + cmpq $15,%r12 + je .L_small_initial_num_blocks_is_15_966 + cmpq $14,%r12 + je .L_small_initial_num_blocks_is_14_966 + jmp .L_small_initial_num_blocks_is_13_966 + +.L_small_initial_num_blocks_is_11_9_966: + + cmpq $11,%r12 + je .L_small_initial_num_blocks_is_11_966 + cmpq $10,%r12 + je .L_small_initial_num_blocks_is_10_966 + jmp .L_small_initial_num_blocks_is_9_966 + +.L_small_initial_num_blocks_is_7_1_966: + cmpq $4,%r12 + je .L_small_initial_num_blocks_is_4_966 + jl .L_small_initial_num_blocks_is_3_1_966 + + cmpq $7,%r12 + je .L_small_initial_num_blocks_is_7_966 + cmpq $6,%r12 + je .L_small_initial_num_blocks_is_6_966 + jmp .L_small_initial_num_blocks_is_5_966 + +.L_small_initial_num_blocks_is_3_1_966: + + cmpq $3,%r12 + je .L_small_initial_num_blocks_is_3_966 + cmpq $2,%r12 + je .L_small_initial_num_blocks_is_2_966 + + + + + +.L_small_initial_num_blocks_is_1_966: + vmovdqa64 SHUF_MASK(%rip),%xmm29 + vpaddd ONE(%rip),%xmm2,%xmm0 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $0,%zmm0,%xmm2 + vpshufb %xmm29,%xmm0,%xmm0 + vmovdqu8 
0(%rcx,%r11,1),%xmm6{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenc %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 176(%rdi),%zmm15 + vaesenc %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 192(%rdi),%zmm15 + vaesenc %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 208(%rdi),%zmm15 + vaesenc %xmm15,%xmm0,%xmm0 + vbroadcastf64x2 224(%rdi),%zmm15 + vaesenclast %xmm15,%xmm0,%xmm0 + vpxorq %xmm6,%xmm0,%xmm0 + vextracti32x4 $0,%zmm0,%xmm12 + movq %r9,%r10 + vmovdqu8 %xmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %xmm29,%xmm6,%xmm6 + vextracti32x4 $0,%zmm6,%xmm13 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_967 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 336(%rsi),%xmm20 + vpclmulqdq $0x01,%xmm20,%xmm6,%xmm4 + vpclmulqdq $0x10,%xmm20,%xmm6,%xmm5 + vpclmulqdq $0x11,%xmm20,%xmm6,%xmm0 + vpclmulqdq $0x00,%xmm20,%xmm6,%xmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_967 +.L_small_initial_partial_block_967: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + + + + + + + + + + + + vpxorq %xmm13,%xmm14,%xmm14 + + jmp .L_after_reduction_967 +.L_small_initial_compute_done_967: +.L_after_reduction_967: + jmp .L_small_initial_blocks_encrypted_966 +.L_small_initial_num_blocks_is_2_966: + vmovdqa64 SHUF_MASK(%rip),%ymm29 + vshufi64x2 $0,%ymm2,%ymm2,%ymm0 + vpaddd ddq_add_1234(%rip),%ymm0,%ymm0 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $1,%zmm0,%xmm2 + vpshufb %ymm29,%ymm0,%ymm0 + vmovdqu8 0(%rcx,%r11,1),%ymm6{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenc %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 
176(%rdi),%zmm15 + vaesenc %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 192(%rdi),%zmm15 + vaesenc %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 208(%rdi),%zmm15 + vaesenc %ymm15,%ymm0,%ymm0 + vbroadcastf64x2 224(%rdi),%zmm15 + vaesenclast %ymm15,%ymm0,%ymm0 + vpxorq %ymm6,%ymm0,%ymm0 + vextracti32x4 $1,%zmm0,%xmm12 + movq %r9,%r10 + vmovdqu8 %ymm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %ymm29,%ymm6,%ymm6 + vextracti32x4 $1,%zmm6,%xmm13 + subq $16 * (2 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_968 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 320(%rsi),%ymm20 + vpclmulqdq $0x01,%ymm20,%ymm6,%ymm4 + vpclmulqdq $0x10,%ymm20,%ymm6,%ymm5 + vpclmulqdq $0x11,%ymm20,%ymm6,%ymm0 + vpclmulqdq $0x00,%ymm20,%ymm6,%ymm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_968 +.L_small_initial_partial_block_968: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 336(%rsi),%xmm20 + vpclmulqdq $0x01,%xmm20,%xmm6,%xmm4 + vpclmulqdq $0x10,%xmm20,%xmm6,%xmm5 + vpclmulqdq $0x11,%xmm20,%xmm6,%xmm0 + vpclmulqdq $0x00,%xmm20,%xmm6,%xmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_968: + + orq %r8,%r8 + je .L_after_reduction_968 + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_968: + jmp .L_small_initial_blocks_encrypted_966 +.L_small_initial_num_blocks_is_3_966: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $2,%zmm0,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vmovdqu8 0(%rcx,%r11,1),%zmm6{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc 
%zmm15,%zmm0,%zmm0 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 176(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 192(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 208(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 224(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vpxorq %zmm6,%zmm0,%zmm0 + vextracti32x4 $2,%zmm0,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %zmm29,%zmm6,%zmm6 + vextracti32x4 $2,%zmm6,%xmm13 + subq $16 * (3 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_969 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 304(%rsi),%ymm20 + vinserti64x2 $2,336(%rsi),%zmm20,%zmm20 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_969 +.L_small_initial_partial_block_969: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 320(%rsi),%ymm20 + vpclmulqdq $0x01,%ymm20,%ymm6,%ymm4 + vpclmulqdq $0x10,%ymm20,%ymm6,%ymm5 + vpclmulqdq $0x11,%ymm20,%ymm6,%ymm0 + vpclmulqdq $0x00,%ymm20,%ymm6,%ymm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_969: + + orq %r8,%r8 + je .L_after_reduction_969 + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_969: + jmp .L_small_initial_blocks_encrypted_966 +.L_small_initial_num_blocks_is_4_966: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $3,%zmm0,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vmovdqu8 0(%rcx,%r11,1),%zmm6{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc 
%zmm15,%zmm0,%zmm0 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 176(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 192(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 208(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vbroadcastf64x2 224(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vpxorq %zmm6,%zmm0,%zmm0 + vextracti32x4 $3,%zmm0,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1){%k1} + vmovdqu8 %zmm0,%zmm0{%k1}{z} + vpshufb %zmm29,%zmm6,%zmm6 + vextracti32x4 $3,%zmm6,%xmm13 + subq $16 * (4 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_970 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 288(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19 + + vpxorq %zmm19,%zmm17,%zmm17 + vpsrldq $8,%zmm17,%zmm4 + vpslldq $8,%zmm17,%zmm5 + vpxorq %zmm4,%zmm15,%zmm0 + vpxorq %zmm5,%zmm16,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_970 +.L_small_initial_partial_block_970: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 304(%rsi),%ymm20 + vinserti64x2 $2,336(%rsi),%zmm20,%zmm20 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_970: + + orq %r8,%r8 + je .L_after_reduction_970 + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_970: + jmp .L_small_initial_blocks_encrypted_966 +.L_small_initial_num_blocks_is_5_966: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $64,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $0,%zmm3,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %xmm29,%xmm3,%xmm3 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%xmm7{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + 
vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 176(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 192(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 208(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %xmm15,%xmm3,%xmm3 + vbroadcastf64x2 224(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %xmm15,%xmm3,%xmm3 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %xmm7,%xmm3,%xmm3 + vextracti32x4 $0,%zmm3,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %xmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm6,%zmm6 + vpshufb %xmm29,%xmm7,%xmm7 + vextracti32x4 $0,%zmm7,%xmm13 + subq $16 * (5 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_971 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 272(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19 + vmovdqu64 336(%rsi),%xmm20 + vpclmulqdq $0x01,%xmm20,%xmm7,%xmm4 + vpclmulqdq $0x10,%xmm20,%xmm7,%xmm5 + vpclmulqdq $0x11,%xmm20,%xmm7,%xmm0 + vpclmulqdq $0x00,%xmm20,%xmm7,%xmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_971 +.L_small_initial_partial_block_971: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 288(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19 + + vpxorq %zmm19,%zmm17,%zmm17 + vpsrldq $8,%zmm17,%zmm4 + vpslldq $8,%zmm17,%zmm5 + vpxorq %zmm4,%zmm15,%zmm0 + vpxorq %zmm5,%zmm16,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq 
%xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_971: + + orq %r8,%r8 + je .L_after_reduction_971 + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_971: + jmp .L_small_initial_blocks_encrypted_966 +.L_small_initial_num_blocks_is_6_966: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $64,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $1,%zmm3,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %ymm29,%ymm3,%ymm3 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%ymm7{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 176(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 192(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 208(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %ymm15,%ymm3,%ymm3 + vbroadcastf64x2 224(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %ymm15,%ymm3,%ymm3 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %ymm7,%ymm3,%ymm3 + vextracti32x4 $1,%zmm3,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %ymm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm6,%zmm6 + vpshufb %ymm29,%ymm7,%ymm7 + vextracti32x4 $1,%zmm7,%xmm13 + subq $16 * (6 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_972 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 256(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19 + vmovdqu64 320(%rsi),%ymm20 + vpclmulqdq $0x01,%ymm20,%ymm7,%ymm4 + vpclmulqdq $0x10,%ymm20,%ymm7,%ymm5 + vpclmulqdq $0x11,%ymm20,%ymm7,%ymm0 + vpclmulqdq $0x00,%ymm20,%ymm7,%ymm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 
$1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_972 +.L_small_initial_partial_block_972: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 272(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19 + vmovdqu64 336(%rsi),%xmm20 + vpclmulqdq $0x01,%xmm20,%xmm7,%xmm4 + vpclmulqdq $0x10,%xmm20,%xmm7,%xmm5 + vpclmulqdq $0x11,%xmm20,%xmm7,%xmm0 + vpclmulqdq $0x00,%xmm20,%xmm7,%xmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_972: + + orq %r8,%r8 + je .L_after_reduction_972 + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_972: + jmp .L_small_initial_blocks_encrypted_966 +.L_small_initial_num_blocks_is_7_966: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $64,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $2,%zmm3,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%zmm7{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 176(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + 
vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 192(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 208(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 224(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %zmm15,%zmm3,%zmm3 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %zmm7,%zmm3,%zmm3 + vextracti32x4 $2,%zmm3,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm6,%zmm6 + vpshufb %zmm29,%zmm7,%zmm7 + vextracti32x4 $2,%zmm7,%xmm13 + subq $16 * (7 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_973 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 240(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19 + vmovdqu64 304(%rsi),%ymm20 + vinserti64x2 $2,336(%rsi),%zmm20,%zmm20 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm5 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_973 +.L_small_initial_partial_block_973: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 256(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19 + vmovdqu64 320(%rsi),%ymm20 + vpclmulqdq $0x01,%ymm20,%ymm7,%ymm4 + vpclmulqdq $0x10,%ymm20,%ymm7,%ymm5 + vpclmulqdq $0x11,%ymm20,%ymm7,%ymm0 + vpclmulqdq $0x00,%ymm20,%ymm7,%ymm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_973: + + orq %r8,%r8 + je .L_after_reduction_973 + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_973: + jmp .L_small_initial_blocks_encrypted_966 +.L_small_initial_num_blocks_is_8_966: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 
$0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $64,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $3,%zmm3,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%zmm7{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 176(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 192(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 208(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vbroadcastf64x2 224(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %zmm15,%zmm3,%zmm3 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %zmm7,%zmm3,%zmm3 + vextracti32x4 $3,%zmm3,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1){%k1} + vmovdqu8 %zmm3,%zmm3{%k1}{z} + vpshufb %zmm29,%zmm6,%zmm6 + vpshufb %zmm29,%zmm7,%zmm7 + vextracti32x4 $3,%zmm7,%xmm13 + subq $16 * (8 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_974 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 224(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 288(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vpxorq %zmm15,%zmm0,%zmm15 + vpxorq %zmm16,%zmm3,%zmm16 + vpxorq %zmm17,%zmm4,%zmm17 + vpxorq %zmm19,%zmm5,%zmm19 + + vpxorq %zmm19,%zmm17,%zmm17 + vpsrldq $8,%zmm17,%zmm4 + vpslldq $8,%zmm17,%zmm5 + vpxorq %zmm4,%zmm15,%zmm0 + vpxorq %zmm5,%zmm16,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_974 +.L_small_initial_partial_block_974: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 
240(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19 + vmovdqu64 304(%rsi),%ymm20 + vinserti64x2 $2,336(%rsi),%zmm20,%zmm20 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm5 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_974: + + orq %r8,%r8 + je .L_after_reduction_974 + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_974: + jmp .L_small_initial_blocks_encrypted_966 +.L_small_initial_num_blocks_is_9_966: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + vpaddd ddq_add_8888(%rip),%zmm0,%zmm4 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $128,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $0,%zmm4,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %xmm29,%xmm4,%xmm4 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%zmm7 + vmovdqu8 128(%rcx,%r11,1),%xmm10{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm15,%zmm3,%zmm3 + vpxorq %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 176(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 192(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 208(%rdi),%zmm15 + vaesenc 
%zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vbroadcastf64x2 224(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %zmm15,%zmm3,%zmm3 + vaesenclast %xmm15,%xmm4,%xmm4 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %zmm7,%zmm3,%zmm3 + vpxorq %xmm10,%xmm4,%xmm4 + vextracti32x4 $0,%zmm4,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %xmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm6,%zmm6 + vpshufb %zmm29,%zmm7,%zmm7 + vpshufb %xmm29,%xmm10,%xmm10 + vextracti32x4 $0,%zmm10,%xmm13 + subq $16 * (9 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_975 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 208(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 272(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vpxorq %zmm15,%zmm0,%zmm15 + vpxorq %zmm16,%zmm3,%zmm16 + vpxorq %zmm17,%zmm4,%zmm17 + vpxorq %zmm19,%zmm5,%zmm19 + vmovdqu64 336(%rsi),%xmm20 + vpclmulqdq $0x01,%xmm20,%xmm10,%xmm4 + vpclmulqdq $0x10,%xmm20,%xmm10,%xmm5 + vpclmulqdq $0x11,%xmm20,%xmm10,%xmm0 + vpclmulqdq $0x00,%xmm20,%xmm10,%xmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_975 +.L_small_initial_partial_block_975: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 224(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 288(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vpxorq %zmm15,%zmm0,%zmm15 + vpxorq %zmm16,%zmm3,%zmm16 + vpxorq %zmm17,%zmm4,%zmm17 + vpxorq %zmm19,%zmm5,%zmm19 + + vpxorq %zmm19,%zmm17,%zmm17 + vpsrldq $8,%zmm17,%zmm4 + vpslldq $8,%zmm17,%zmm5 + vpxorq %zmm4,%zmm15,%zmm0 + vpxorq %zmm5,%zmm16,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + 
+.L_small_initial_compute_done_975: + + orq %r8,%r8 + je .L_after_reduction_975 + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_975: + jmp .L_small_initial_blocks_encrypted_966 +.L_small_initial_num_blocks_is_10_966: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + vpaddd ddq_add_8888(%rip),%zmm0,%zmm4 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $128,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $1,%zmm4,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %ymm29,%ymm4,%ymm4 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%zmm7 + vmovdqu8 128(%rcx,%r11,1),%ymm10{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm15,%zmm3,%zmm3 + vpxorq %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 176(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 192(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 208(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %ymm15,%ymm4,%ymm4 + vbroadcastf64x2 224(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %zmm15,%zmm3,%zmm3 + vaesenclast %ymm15,%ymm4,%ymm4 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %zmm7,%zmm3,%zmm3 + vpxorq %ymm10,%ymm4,%ymm4 + vextracti32x4 $1,%zmm4,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %ymm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm6,%zmm6 + vpshufb %zmm29,%zmm7,%zmm7 + vpshufb %ymm29,%ymm10,%ymm10 + vextracti32x4 $1,%zmm10,%xmm13 + subq $16 * (10 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_976 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 192(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 256(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vpxorq %zmm15,%zmm0,%zmm15 + vpxorq %zmm16,%zmm3,%zmm16 + vpxorq 
%zmm17,%zmm4,%zmm17 + vpxorq %zmm19,%zmm5,%zmm19 + vmovdqu64 320(%rsi),%ymm20 + vpclmulqdq $0x01,%ymm20,%ymm10,%ymm4 + vpclmulqdq $0x10,%ymm20,%ymm10,%ymm5 + vpclmulqdq $0x11,%ymm20,%ymm10,%ymm0 + vpclmulqdq $0x00,%ymm20,%ymm10,%ymm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_976 +.L_small_initial_partial_block_976: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 208(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 272(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vpxorq %zmm15,%zmm0,%zmm15 + vpxorq %zmm16,%zmm3,%zmm16 + vpxorq %zmm17,%zmm4,%zmm17 + vpxorq %zmm19,%zmm5,%zmm19 + vmovdqu64 336(%rsi),%xmm20 + vpclmulqdq $0x01,%xmm20,%xmm10,%xmm4 + vpclmulqdq $0x10,%xmm20,%xmm10,%xmm5 + vpclmulqdq $0x11,%xmm20,%xmm10,%xmm0 + vpclmulqdq $0x00,%xmm20,%xmm10,%xmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_976: + + orq %r8,%r8 + je .L_after_reduction_976 + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_976: + jmp .L_small_initial_blocks_encrypted_966 +.L_small_initial_num_blocks_is_11_966: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + vpaddd ddq_add_8888(%rip),%zmm0,%zmm4 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $128,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $2,%zmm4,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%zmm7 + vmovdqu8 128(%rcx,%r11,1),%zmm10{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm15,%zmm3,%zmm3 + vpxorq %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc 
%zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 176(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 192(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 208(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 224(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %zmm15,%zmm3,%zmm3 + vaesenclast %zmm15,%zmm4,%zmm4 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %zmm7,%zmm3,%zmm3 + vpxorq %zmm10,%zmm4,%zmm4 + vextracti32x4 $2,%zmm4,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm6,%zmm6 + vpshufb %zmm29,%zmm7,%zmm7 + vpshufb %zmm29,%zmm10,%zmm10 + vextracti32x4 $2,%zmm10,%xmm13 + subq $16 * (11 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_977 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 176(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 240(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vpxorq %zmm15,%zmm0,%zmm15 + vpxorq %zmm16,%zmm3,%zmm16 + vpxorq %zmm17,%zmm4,%zmm17 + vpxorq %zmm19,%zmm5,%zmm19 + vmovdqu64 304(%rsi),%ymm20 + vinserti64x2 $2,336(%rsi),%zmm20,%zmm20 + vpclmulqdq $0x01,%zmm20,%zmm10,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm10,%zmm5 + vpclmulqdq $0x11,%zmm20,%zmm10,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm10,%zmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq 
$0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_977 +.L_small_initial_partial_block_977: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 192(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 256(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vpxorq %zmm15,%zmm0,%zmm15 + vpxorq %zmm16,%zmm3,%zmm16 + vpxorq %zmm17,%zmm4,%zmm17 + vpxorq %zmm19,%zmm5,%zmm19 + vmovdqu64 320(%rsi),%ymm20 + vpclmulqdq $0x01,%ymm20,%ymm10,%ymm4 + vpclmulqdq $0x10,%ymm20,%ymm10,%ymm5 + vpclmulqdq $0x11,%ymm20,%ymm10,%ymm0 + vpclmulqdq $0x00,%ymm20,%ymm10,%ymm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_977: + + orq %r8,%r8 + je .L_after_reduction_977 + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_977: + jmp .L_small_initial_blocks_encrypted_966 +.L_small_initial_num_blocks_is_12_966: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + vpaddd ddq_add_8888(%rip),%zmm0,%zmm4 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $128,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $3,%zmm4,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%zmm7 + vmovdqu8 128(%rcx,%r11,1),%zmm10{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm15,%zmm3,%zmm3 + vpxorq %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc 
%zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 176(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 192(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 208(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vbroadcastf64x2 224(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %zmm15,%zmm3,%zmm3 + vaesenclast %zmm15,%zmm4,%zmm4 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %zmm7,%zmm3,%zmm3 + vpxorq %zmm10,%zmm4,%zmm4 + vextracti32x4 $3,%zmm4,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1){%k1} + vmovdqu8 %zmm4,%zmm4{%k1}{z} + vpshufb %zmm29,%zmm6,%zmm6 + vpshufb %zmm29,%zmm7,%zmm7 + vpshufb %zmm29,%zmm10,%zmm10 + vextracti32x4 $3,%zmm10,%xmm13 + subq $16 * (12 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_978 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 160(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 224(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vmovdqu64 288(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm0,%zmm6,%zmm15 + vpternlogq $0x96,%zmm3,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm4,%zmm6,%zmm17 + vpternlogq $0x96,%zmm5,%zmm7,%zmm19 + + vpxorq %zmm19,%zmm17,%zmm17 + vpsrldq $8,%zmm17,%zmm4 + vpslldq $8,%zmm17,%zmm5 + vpxorq %zmm4,%zmm15,%zmm0 + vpxorq %zmm5,%zmm16,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_978 +.L_small_initial_partial_block_978: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 176(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 240(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vpxorq %zmm15,%zmm0,%zmm15 + vpxorq %zmm16,%zmm3,%zmm16 + vpxorq %zmm17,%zmm4,%zmm17 + vpxorq %zmm19,%zmm5,%zmm19 + vmovdqu64 304(%rsi),%ymm20 + vinserti64x2 $2,336(%rsi),%zmm20,%zmm20 + vpclmulqdq $0x01,%zmm20,%zmm10,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm10,%zmm5 + vpclmulqdq $0x11,%zmm20,%zmm10,%zmm0 + 
vpclmulqdq $0x00,%zmm20,%zmm10,%zmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_978: + + orq %r8,%r8 + je .L_after_reduction_978 + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_978: + jmp .L_small_initial_blocks_encrypted_966 +.L_small_initial_num_blocks_is_13_966: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + vpaddd ddq_add_8888(%rip),%zmm0,%zmm4 + vpaddd ddq_add_8888(%rip),%zmm3,%zmm5 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $192,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $0,%zmm5,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %xmm29,%xmm5,%xmm5 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%zmm7 + vmovdqu8 128(%rcx,%r11,1),%zmm10 + vmovdqu8 192(%rcx,%r11,1),%xmm11{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm15,%zmm3,%zmm3 + vpxorq %zmm15,%zmm4,%zmm4 + vpxorq %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 176(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 192(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc 
%zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 208(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vbroadcastf64x2 224(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %zmm15,%zmm3,%zmm3 + vaesenclast %zmm15,%zmm4,%zmm4 + vaesenclast %xmm15,%xmm5,%xmm5 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %zmm7,%zmm3,%zmm3 + vpxorq %zmm10,%zmm4,%zmm4 + vpxorq %xmm11,%xmm5,%xmm5 + vextracti32x4 $0,%zmm5,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %xmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm6,%zmm6 + vpshufb %zmm29,%zmm7,%zmm7 + vpshufb %zmm29,%zmm10,%zmm10 + vpshufb %xmm29,%xmm11,%xmm11 + vextracti32x4 $0,%zmm11,%xmm13 + subq $16 * (13 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_979 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 144(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 208(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vmovdqu64 272(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm0,%zmm6,%zmm15 + vpternlogq $0x96,%zmm3,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm4,%zmm6,%zmm17 + vpternlogq $0x96,%zmm5,%zmm7,%zmm19 + vmovdqu64 336(%rsi),%xmm20 + vpclmulqdq $0x01,%xmm20,%xmm11,%xmm4 + vpclmulqdq $0x10,%xmm20,%xmm11,%xmm5 + vpclmulqdq $0x11,%xmm20,%xmm11,%xmm0 + vpclmulqdq $0x00,%xmm20,%xmm11,%xmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_979 +.L_small_initial_partial_block_979: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 160(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 224(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vmovdqu64 288(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm0,%zmm6,%zmm15 + vpternlogq $0x96,%zmm3,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm4,%zmm6,%zmm17 + vpternlogq 
$0x96,%zmm5,%zmm7,%zmm19 + + vpxorq %zmm19,%zmm17,%zmm17 + vpsrldq $8,%zmm17,%zmm4 + vpslldq $8,%zmm17,%zmm5 + vpxorq %zmm4,%zmm15,%zmm0 + vpxorq %zmm5,%zmm16,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_979: + + orq %r8,%r8 + je .L_after_reduction_979 + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_979: + jmp .L_small_initial_blocks_encrypted_966 +.L_small_initial_num_blocks_is_14_966: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + vpaddd ddq_add_8888(%rip),%zmm0,%zmm4 + vpaddd ddq_add_8888(%rip),%zmm3,%zmm5 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $192,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $1,%zmm5,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %ymm29,%ymm5,%ymm5 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%zmm7 + vmovdqu8 128(%rcx,%r11,1),%zmm10 + vmovdqu8 192(%rcx,%r11,1),%ymm11{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm15,%zmm3,%zmm3 + vpxorq %zmm15,%zmm4,%zmm4 + vpxorq %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 176(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 192(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 208(%rdi),%zmm15 + vaesenc 
%zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %ymm15,%ymm5,%ymm5 + vbroadcastf64x2 224(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %zmm15,%zmm3,%zmm3 + vaesenclast %zmm15,%zmm4,%zmm4 + vaesenclast %ymm15,%ymm5,%ymm5 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %zmm7,%zmm3,%zmm3 + vpxorq %zmm10,%zmm4,%zmm4 + vpxorq %ymm11,%ymm5,%ymm5 + vextracti32x4 $1,%zmm5,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %ymm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm6,%zmm6 + vpshufb %zmm29,%zmm7,%zmm7 + vpshufb %zmm29,%zmm10,%zmm10 + vpshufb %ymm29,%ymm11,%ymm11 + vextracti32x4 $1,%zmm11,%xmm13 + subq $16 * (14 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_980 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 128(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 192(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vmovdqu64 256(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm0,%zmm6,%zmm15 + vpternlogq $0x96,%zmm3,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm4,%zmm6,%zmm17 + vpternlogq $0x96,%zmm5,%zmm7,%zmm19 + vmovdqu64 320(%rsi),%ymm20 + vpclmulqdq $0x01,%ymm20,%ymm11,%ymm4 + vpclmulqdq $0x10,%ymm20,%ymm11,%ymm5 + vpclmulqdq $0x11,%ymm20,%ymm11,%ymm0 + vpclmulqdq $0x00,%ymm20,%ymm11,%ymm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_980 +.L_small_initial_partial_block_980: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 144(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 208(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vmovdqu64 272(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm0,%zmm6,%zmm15 + vpternlogq $0x96,%zmm3,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm4,%zmm6,%zmm17 + vpternlogq $0x96,%zmm5,%zmm7,%zmm19 + vmovdqu64 336(%rsi),%xmm20 + vpclmulqdq $0x01,%xmm20,%xmm11,%xmm4 + vpclmulqdq 
$0x10,%xmm20,%xmm11,%xmm5 + vpclmulqdq $0x11,%xmm20,%xmm11,%xmm0 + vpclmulqdq $0x00,%xmm20,%xmm11,%xmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_980: + + orq %r8,%r8 + je .L_after_reduction_980 + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_980: + jmp .L_small_initial_blocks_encrypted_966 +.L_small_initial_num_blocks_is_15_966: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + vpaddd ddq_add_8888(%rip),%zmm0,%zmm4 + vpaddd ddq_add_8888(%rip),%zmm3,%zmm5 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $192,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $2,%zmm5,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%zmm7 + vmovdqu8 128(%rcx,%r11,1),%zmm10 + vmovdqu8 192(%rcx,%r11,1),%zmm11{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm15,%zmm3,%zmm3 + vpxorq %zmm15,%zmm4,%zmm4 + vpxorq %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 176(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 
192(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 208(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 224(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %zmm15,%zmm3,%zmm3 + vaesenclast %zmm15,%zmm4,%zmm4 + vaesenclast %zmm15,%zmm5,%zmm5 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %zmm7,%zmm3,%zmm3 + vpxorq %zmm10,%zmm4,%zmm4 + vpxorq %zmm11,%zmm5,%zmm5 + vextracti32x4 $2,%zmm5,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm6,%zmm6 + vpshufb %zmm29,%zmm7,%zmm7 + vpshufb %zmm29,%zmm10,%zmm10 + vpshufb %zmm29,%zmm11,%zmm11 + vextracti32x4 $2,%zmm11,%xmm13 + subq $16 * (15 - 1),%r8 + + + cmpq $16,%r8 + jl .L_small_initial_partial_block_981 + + + + + + subq $16,%r8 + movq $0,(%rdx) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 112(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 176(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vmovdqu64 240(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm0,%zmm6,%zmm15 + vpternlogq $0x96,%zmm3,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm4,%zmm6,%zmm17 + vpternlogq $0x96,%zmm5,%zmm7,%zmm19 + vmovdqu64 304(%rsi),%ymm20 + vinserti64x2 $2,336(%rsi),%zmm20,%zmm20 + vpclmulqdq $0x01,%zmm20,%zmm11,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm11,%zmm5 + vpclmulqdq $0x11,%zmm20,%zmm11,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm11,%zmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + + jmp .L_small_initial_compute_done_981 +.L_small_initial_partial_block_981: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 128(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 192(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vmovdqu64 256(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm0,%zmm6,%zmm15 + vpternlogq $0x96,%zmm3,%zmm7,%zmm16 + vpclmulqdq 
$0x01,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm4,%zmm6,%zmm17 + vpternlogq $0x96,%zmm5,%zmm7,%zmm19 + vmovdqu64 320(%rsi),%ymm20 + vpclmulqdq $0x01,%ymm20,%ymm11,%ymm4 + vpclmulqdq $0x10,%ymm20,%ymm11,%ymm5 + vpclmulqdq $0x11,%ymm20,%ymm11,%ymm0 + vpclmulqdq $0x00,%ymm20,%ymm11,%ymm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_981: + + orq %r8,%r8 + je .L_after_reduction_981 + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_981: + jmp .L_small_initial_blocks_encrypted_966 +.L_small_initial_num_blocks_is_16_966: + vmovdqa64 SHUF_MASK(%rip),%zmm29 + vshufi64x2 $0,%zmm2,%zmm2,%zmm2 + vpaddd ddq_add_1234(%rip),%zmm2,%zmm0 + vpaddd ddq_add_5678(%rip),%zmm2,%zmm3 + vpaddd ddq_add_8888(%rip),%zmm0,%zmm4 + vpaddd ddq_add_8888(%rip),%zmm3,%zmm5 + leaq byte64_len_to_mask_table(%rip),%r10 + movq %r8,%r15 + subq $192,%r15 + kmovq (%r10,%r15,8),%k1 + vextracti32x4 $3,%zmm5,%xmm2 + vpshufb %zmm29,%zmm0,%zmm0 + vpshufb %zmm29,%zmm3,%zmm3 + vpshufb %zmm29,%zmm4,%zmm4 + vpshufb %zmm29,%zmm5,%zmm5 + vmovdqu8 0(%rcx,%r11,1),%zmm6 + vmovdqu8 64(%rcx,%r11,1),%zmm7 + vmovdqu8 128(%rcx,%r11,1),%zmm10 + vmovdqu8 192(%rcx,%r11,1),%zmm11{%k1}{z} + vbroadcastf64x2 0(%rdi),%zmm15 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm15,%zmm3,%zmm3 + vpxorq %zmm15,%zmm4,%zmm4 + vpxorq %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 16(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 32(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 48(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 64(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 80(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 96(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 112(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 128(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 144(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 160(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + 
vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 176(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 192(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 208(%rdi),%zmm15 + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm3,%zmm3 + vaesenc %zmm15,%zmm4,%zmm4 + vaesenc %zmm15,%zmm5,%zmm5 + vbroadcastf64x2 224(%rdi),%zmm15 + vaesenclast %zmm15,%zmm0,%zmm0 + vaesenclast %zmm15,%zmm3,%zmm3 + vaesenclast %zmm15,%zmm4,%zmm4 + vaesenclast %zmm15,%zmm5,%zmm5 + vpxorq %zmm6,%zmm0,%zmm0 + vpxorq %zmm7,%zmm3,%zmm3 + vpxorq %zmm10,%zmm4,%zmm4 + vpxorq %zmm11,%zmm5,%zmm5 + vextracti32x4 $3,%zmm5,%xmm12 + movq %r9,%r10 + vmovdqu8 %zmm0,0(%r10,%r11,1) + vmovdqu8 %zmm3,64(%r10,%r11,1) + vmovdqu8 %zmm4,128(%r10,%r11,1) + vmovdqu8 %zmm5,192(%r10,%r11,1){%k1} + vmovdqu8 %zmm5,%zmm5{%k1}{z} + vpshufb %zmm29,%zmm6,%zmm6 + vpshufb %zmm29,%zmm7,%zmm7 + vpshufb %zmm29,%zmm10,%zmm10 + vpshufb %zmm29,%zmm11,%zmm11 + vextracti32x4 $3,%zmm11,%xmm13 + subq $16 * (16 - 1),%r8 +.L_small_initial_partial_block_982: + + + + + + + + + movq %r8,(%rdx) + vmovdqu64 %xmm12,16(%rsi) + vpxorq %zmm14,%zmm6,%zmm6 + vmovdqu64 112(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3 + vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5 + vmovdqu64 176(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15 + vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17 + vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19 + vmovdqu64 240(%rsi),%zmm20 + vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm0,%zmm6,%zmm15 + vpternlogq $0x96,%zmm3,%zmm7,%zmm16 + vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6 + vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7 + vpternlogq $0x96,%zmm4,%zmm6,%zmm17 + vpternlogq $0x96,%zmm5,%zmm7,%zmm19 + vmovdqu64 304(%rsi),%ymm20 + vinserti64x2 $2,336(%rsi),%zmm20,%zmm20 + vpclmulqdq $0x01,%zmm20,%zmm11,%zmm4 + vpclmulqdq $0x10,%zmm20,%zmm11,%zmm5 + vpclmulqdq $0x11,%zmm20,%zmm11,%zmm0 + vpclmulqdq $0x00,%zmm20,%zmm11,%zmm3 + + vpxorq %zmm17,%zmm4,%zmm4 + vpxorq %zmm19,%zmm5,%zmm5 + vpxorq %zmm15,%zmm0,%zmm0 + vpxorq %zmm16,%zmm3,%zmm3 + + vpxorq %zmm5,%zmm4,%zmm4 + vpsrldq $8,%zmm4,%zmm17 + vpslldq $8,%zmm4,%zmm19 + vpxorq %zmm17,%zmm0,%zmm0 + vpxorq %zmm19,%zmm3,%zmm3 + vextracti64x4 $1,%zmm0,%ymm17 + vpxorq %ymm17,%ymm0,%ymm0 + vextracti32x4 $1,%ymm0,%xmm17 + vpxorq %xmm17,%xmm0,%xmm0 + vextracti64x4 $1,%zmm3,%ymm19 + vpxorq %ymm19,%ymm3,%ymm3 + vextracti32x4 $1,%ymm3,%xmm19 + vpxorq %xmm19,%xmm3,%xmm3 + vmovdqa64 POLY2(%rip),%xmm20 + + + vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm3,%xmm4 + + + vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5 + vpsrldq $4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14 + vpslldq $4,%xmm14,%xmm14 + vpternlogq $0x96,%xmm0,%xmm5,%xmm14 + +.L_small_initial_compute_done_982: + vpxorq %xmm13,%xmm14,%xmm14 +.L_after_reduction_982: +.L_small_initial_blocks_encrypted_966: +.L_ghash_done_821: + vmovdqu64 %xmm2,0(%rsi) + vmovdqu64 %xmm14,64(%rsi) +.L_enc_dec_done_821: + jmp .Lexit_gcm_decrypt +.Lexit_gcm_decrypt: + cmpq $256,%r8 + jbe .Lskip_hkeys_cleanup_983 + vpxor %xmm0,%xmm0,%xmm0 + vmovdqa64 %zmm0,0(%rsp) + vmovdqa64 %zmm0,64(%rsp) + vmovdqa64 %zmm0,128(%rsp) + vmovdqa64 %zmm0,192(%rsp) + vmovdqa64 %zmm0,256(%rsp) + vmovdqa64 %zmm0,320(%rsp) + vmovdqa64 
%zmm0,384(%rsp) + vmovdqa64 %zmm0,448(%rsp) + vmovdqa64 %zmm0,512(%rsp) + vmovdqa64 %zmm0,576(%rsp) + vmovdqa64 %zmm0,640(%rsp) + vmovdqa64 %zmm0,704(%rsp) +.Lskip_hkeys_cleanup_983: + vzeroupper + leaq (%rbp),%rsp +.cfi_def_cfa_register %rsp + popq %r15 +.cfi_adjust_cfa_offset -8 +.cfi_restore %r15 + popq %r14 +.cfi_adjust_cfa_offset -8 +.cfi_restore %r14 + popq %r13 +.cfi_adjust_cfa_offset -8 +.cfi_restore %r13 + popq %r12 +.cfi_adjust_cfa_offset -8 +.cfi_restore %r12 + popq %rbp +.cfi_adjust_cfa_offset -8 +.cfi_restore %rbp + popq %rbx +.cfi_adjust_cfa_offset -8 +.cfi_restore %rbx + .byte 0xf3,0xc3 +.Ldecrypt_seh_end: +.cfi_endproc +.size ossl_aes_gcm_decrypt_avx512, .-ossl_aes_gcm_decrypt_avx512 +.globl ossl_aes_gcm_finalize_avx512 +.type ossl_aes_gcm_finalize_avx512,@function +.align 32 +ossl_aes_gcm_finalize_avx512: +.cfi_startproc +.byte 243,15,30,250 + vmovdqu 336(%rdi),%xmm2 + vmovdqu 32(%rdi),%xmm3 + vmovdqu 64(%rdi),%xmm4 + + + cmpq $0,%rsi + je .L_partial_done_984 + + vpclmulqdq $0x11,%xmm2,%xmm4,%xmm0 + vpclmulqdq $0x00,%xmm2,%xmm4,%xmm16 + vpclmulqdq $0x01,%xmm2,%xmm4,%xmm17 + vpclmulqdq $0x10,%xmm2,%xmm4,%xmm4 + vpxorq %xmm17,%xmm4,%xmm4 + + vpsrldq $8,%xmm4,%xmm17 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm17,%xmm0,%xmm0 + vpxorq %xmm16,%xmm4,%xmm4 + + + + vmovdqu64 POLY2(%rip),%xmm17 + + vpclmulqdq $0x01,%xmm4,%xmm17,%xmm16 + vpslldq $8,%xmm16,%xmm16 + vpxorq %xmm16,%xmm4,%xmm4 + + + + vpclmulqdq $0x00,%xmm4,%xmm17,%xmm16 + vpsrldq $4,%xmm16,%xmm16 + vpclmulqdq $0x10,%xmm4,%xmm17,%xmm4 + vpslldq $4,%xmm4,%xmm4 + + vpternlogq $0x96,%xmm16,%xmm0,%xmm4 + +.L_partial_done_984: + vmovq 56(%rdi),%xmm5 + vpinsrq $1,48(%rdi),%xmm5,%xmm5 + vpsllq $3,%xmm5,%xmm5 + + vpxor %xmm5,%xmm4,%xmm4 + + vpclmulqdq $0x11,%xmm2,%xmm4,%xmm0 + vpclmulqdq $0x00,%xmm2,%xmm4,%xmm16 + vpclmulqdq $0x01,%xmm2,%xmm4,%xmm17 + vpclmulqdq $0x10,%xmm2,%xmm4,%xmm4 + vpxorq %xmm17,%xmm4,%xmm4 + + vpsrldq $8,%xmm4,%xmm17 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm17,%xmm0,%xmm0 + vpxorq %xmm16,%xmm4,%xmm4 + + + + vmovdqu64 POLY2(%rip),%xmm17 + + vpclmulqdq $0x01,%xmm4,%xmm17,%xmm16 + vpslldq $8,%xmm16,%xmm16 + vpxorq %xmm16,%xmm4,%xmm4 + + + + vpclmulqdq $0x00,%xmm4,%xmm17,%xmm16 + vpsrldq $4,%xmm16,%xmm16 + vpclmulqdq $0x10,%xmm4,%xmm17,%xmm4 + vpslldq $4,%xmm4,%xmm4 + + vpternlogq $0x96,%xmm16,%xmm0,%xmm4 + + vpshufb SHUF_MASK(%rip),%xmm4,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + +.L_return_T_984: + vmovdqu %xmm3,64(%rdi) +.Labort_finalize: + .byte 0xf3,0xc3 +.cfi_endproc +.size ossl_aes_gcm_finalize_avx512, .-ossl_aes_gcm_finalize_avx512 +.globl ossl_gcm_gmult_avx512 +.hidden ossl_gcm_gmult_avx512 +.type ossl_gcm_gmult_avx512,@function +.align 32 +ossl_gcm_gmult_avx512: +.cfi_startproc +.byte 243,15,30,250 + vmovdqu64 (%rdi),%xmm1 + vmovdqu64 336(%rsi),%xmm2 + + vpclmulqdq $0x11,%xmm2,%xmm1,%xmm3 + vpclmulqdq $0x00,%xmm2,%xmm1,%xmm4 + vpclmulqdq $0x01,%xmm2,%xmm1,%xmm5 + vpclmulqdq $0x10,%xmm2,%xmm1,%xmm1 + vpxorq %xmm5,%xmm1,%xmm1 + + vpsrldq $8,%xmm1,%xmm5 + vpslldq $8,%xmm1,%xmm1 + vpxorq %xmm5,%xmm3,%xmm3 + vpxorq %xmm4,%xmm1,%xmm1 + + + + vmovdqu64 POLY2(%rip),%xmm5 + + vpclmulqdq $0x01,%xmm1,%xmm5,%xmm4 + vpslldq $8,%xmm4,%xmm4 + vpxorq %xmm4,%xmm1,%xmm1 + + + + vpclmulqdq $0x00,%xmm1,%xmm5,%xmm4 + vpsrldq $4,%xmm4,%xmm4 + vpclmulqdq $0x10,%xmm1,%xmm5,%xmm1 + vpslldq $4,%xmm1,%xmm1 + + vpternlogq $0x96,%xmm4,%xmm3,%xmm1 + + vmovdqu64 %xmm1,(%rdi) + vzeroupper +.Labort_gmult: + .byte 0xf3,0xc3 +.cfi_endproc +.size ossl_gcm_gmult_avx512, .-ossl_gcm_gmult_avx512 +.section .rodata +.align 16 +POLY:.quad 0x0000000000000001, 
0xC200000000000000 + +.align 64 +POLY2: +.quad 0x00000001C2000000, 0xC200000000000000 +.quad 0x00000001C2000000, 0xC200000000000000 +.quad 0x00000001C2000000, 0xC200000000000000 +.quad 0x00000001C2000000, 0xC200000000000000 + +.align 16 +TWOONE:.quad 0x0000000000000001, 0x0000000100000000 + + + +.align 64 +SHUF_MASK: +.quad 0x08090A0B0C0D0E0F, 0x0001020304050607 +.quad 0x08090A0B0C0D0E0F, 0x0001020304050607 +.quad 0x08090A0B0C0D0E0F, 0x0001020304050607 +.quad 0x08090A0B0C0D0E0F, 0x0001020304050607 + +.align 16 +SHIFT_MASK: +.quad 0x0706050403020100, 0x0f0e0d0c0b0a0908 + +ALL_F: +.quad 0xffffffffffffffff, 0xffffffffffffffff + +ZERO: +.quad 0x0000000000000000, 0x0000000000000000 + +.align 16 +ONE: +.quad 0x0000000000000001, 0x0000000000000000 + +.align 16 +ONEf: +.quad 0x0000000000000000, 0x0100000000000000 + +.align 64 +ddq_add_1234: +.quad 0x0000000000000001, 0x0000000000000000 +.quad 0x0000000000000002, 0x0000000000000000 +.quad 0x0000000000000003, 0x0000000000000000 +.quad 0x0000000000000004, 0x0000000000000000 + +.align 64 +ddq_add_5678: +.quad 0x0000000000000005, 0x0000000000000000 +.quad 0x0000000000000006, 0x0000000000000000 +.quad 0x0000000000000007, 0x0000000000000000 +.quad 0x0000000000000008, 0x0000000000000000 + +.align 64 +ddq_add_4444: +.quad 0x0000000000000004, 0x0000000000000000 +.quad 0x0000000000000004, 0x0000000000000000 +.quad 0x0000000000000004, 0x0000000000000000 +.quad 0x0000000000000004, 0x0000000000000000 + +.align 64 +ddq_add_8888: +.quad 0x0000000000000008, 0x0000000000000000 +.quad 0x0000000000000008, 0x0000000000000000 +.quad 0x0000000000000008, 0x0000000000000000 +.quad 0x0000000000000008, 0x0000000000000000 + +.align 64 +ddq_addbe_1234: +.quad 0x0000000000000000, 0x0100000000000000 +.quad 0x0000000000000000, 0x0200000000000000 +.quad 0x0000000000000000, 0x0300000000000000 +.quad 0x0000000000000000, 0x0400000000000000 + +.align 64 +ddq_addbe_4444: +.quad 0x0000000000000000, 0x0400000000000000 +.quad 0x0000000000000000, 0x0400000000000000 +.quad 0x0000000000000000, 0x0400000000000000 +.quad 0x0000000000000000, 0x0400000000000000 + +.align 64 +byte_len_to_mask_table: +.value 0x0000, 0x0001, 0x0003, 0x0007 +.value 0x000f, 0x001f, 0x003f, 0x007f +.value 0x00ff, 0x01ff, 0x03ff, 0x07ff +.value 0x0fff, 0x1fff, 0x3fff, 0x7fff +.value 0xffff + +.align 64 +byte64_len_to_mask_table: +.quad 0x0000000000000000, 0x0000000000000001 +.quad 0x0000000000000003, 0x0000000000000007 +.quad 0x000000000000000f, 0x000000000000001f +.quad 0x000000000000003f, 0x000000000000007f +.quad 0x00000000000000ff, 0x00000000000001ff +.quad 0x00000000000003ff, 0x00000000000007ff +.quad 0x0000000000000fff, 0x0000000000001fff +.quad 0x0000000000003fff, 0x0000000000007fff +.quad 0x000000000000ffff, 0x000000000001ffff +.quad 0x000000000003ffff, 0x000000000007ffff +.quad 0x00000000000fffff, 0x00000000001fffff +.quad 0x00000000003fffff, 0x00000000007fffff +.quad 0x0000000000ffffff, 0x0000000001ffffff +.quad 0x0000000003ffffff, 0x0000000007ffffff +.quad 0x000000000fffffff, 0x000000001fffffff +.quad 0x000000003fffffff, 0x000000007fffffff +.quad 0x00000000ffffffff, 0x00000001ffffffff +.quad 0x00000003ffffffff, 0x00000007ffffffff +.quad 0x0000000fffffffff, 0x0000001fffffffff +.quad 0x0000003fffffffff, 0x0000007fffffffff +.quad 0x000000ffffffffff, 0x000001ffffffffff +.quad 0x000003ffffffffff, 0x000007ffffffffff +.quad 0x00000fffffffffff, 0x00001fffffffffff +.quad 0x00003fffffffffff, 0x00007fffffffffff +.quad 0x0000ffffffffffff, 0x0001ffffffffffff +.quad 0x0003ffffffffffff, 0x0007ffffffffffff +.quad 
0x000fffffffffffff, 0x001fffffffffffff +.quad 0x003fffffffffffff, 0x007fffffffffffff +.quad 0x00ffffffffffffff, 0x01ffffffffffffff +.quad 0x03ffffffffffffff, 0x07ffffffffffffff +.quad 0x0fffffffffffffff, 0x1fffffffffffffff +.quad 0x3fffffffffffffff, 0x7fffffffffffffff +.quad 0xffffffffffffffff + .section ".note.gnu.property", "a" + .p2align 3 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + # "GNU" encoded with .byte, since .asciz isn't supported + # on Solaris. + .byte 0x47 + .byte 0x4e + .byte 0x55 + .byte 0 +1: + .p2align 3 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 3 +4: diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/modes/aesni-gcm-x86_64.s b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/modes/aesni-gcm-x86_64.s index 288f44af921f..4250ede2b7c3 100644 --- a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/modes/aesni-gcm-x86_64.s +++ b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/modes/aesni-gcm-x86_64.s @@ -774,6 +774,7 @@ aesni_gcm_encrypt: .byte 0xf3,0xc3 .cfi_endproc .size aesni_gcm_encrypt,.-aesni_gcm_encrypt +.section .rodata .align 64 .Lbswap_mask: .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 @@ -786,6 +787,7 @@ aesni_gcm_encrypt: .Lone_lsb: .byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 .byte 65,69,83,45,78,73,32,71,67,77,32,109,111,100,117,108,101,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.previous .align 64 .section ".note.gnu.property", "a" .p2align 3 diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/modes/ghash-x86_64.s b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/modes/ghash-x86_64.s index ac4823fe5892..8073c88018a1 100644 --- a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/modes/ghash-x86_64.s +++ b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/modes/ghash-x86_64.s @@ -708,6 +708,7 @@ gcm_ghash_4bit: .align 16 gcm_init_clmul: .cfi_startproc +.byte 243,15,30,250 .L_init_clmul: movdqu (%rsi),%xmm2 pshufd $78,%xmm2,%xmm2 @@ -1306,6 +1307,7 @@ gcm_ghash_clmul: .align 32 gcm_init_avx: .cfi_startproc +.byte 243,15,30,250 vzeroupper vmovdqu (%rsi),%xmm2 @@ -1798,6 +1800,7 @@ gcm_ghash_avx: .byte 0xf3,0xc3 .cfi_endproc .size gcm_ghash_avx,.-gcm_ghash_avx +.section .rodata .align 64 .Lbswap_mask: .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 @@ -1851,6 +1854,7 @@ gcm_ghash_avx: .byte 71,72,65,83,72,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 64 +.previous .section ".note.gnu.property", "a" .p2align 3 .long 1f - 0f diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/sha/keccak1600-x86_64.s b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/sha/keccak1600-x86_64.s index 9c0054aa1754..38397fb46079 100644 --- a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/sha/keccak1600-x86_64.s +++ b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/sha/keccak1600-x86_64.s @@ -447,10 +447,12 @@ SHA3_squeeze: .cfi_offset %r14,-32 shrq $3,%rcx - movq %rdi,%r8 + movq %rdi,%r9 movq %rsi,%r12 movq %rdx,%r13 movq %rcx,%r14 + btl $0,%r8d + jc .Lnext_block jmp .Loop_squeeze .align 32 @@ -458,8 +460,8 @@ SHA3_squeeze: cmpq $8,%r13 jb .Ltail_squeeze - movq (%r8),%rax - leaq 8(%r8),%r8 + movq (%r9),%rax + leaq 8(%r9),%r9 movq %rax,(%r12) leaq 8(%r12),%r12 subq $8,%r13 @@ -467,14 +469,14 @@ SHA3_squeeze: subq $1,%rcx jnz .Loop_squeeze - 
+.Lnext_block: call KeccakF1600 - movq %rdi,%r8 + movq %rdi,%r9 movq %r14,%rcx jmp .Loop_squeeze .Ltail_squeeze: - movq %r8,%rsi + movq %r9,%rsi movq %r12,%rdi movq %r13,%rcx .byte 0xf3,0xa4 @@ -492,6 +494,7 @@ SHA3_squeeze: .byte 0xf3,0xc3 .cfi_endproc .size SHA3_squeeze,.-SHA3_squeeze +.section .rodata .align 256 .quad 0,0,0,0,0,0,0,0 .type iotas,@object diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/sha/sha1-mb-x86_64.s b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/sha/sha1-mb-x86_64.s index 589ffb37468e..ea33cad5e813 100644 --- a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/sha/sha1-mb-x86_64.s +++ b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/sha/sha1-mb-x86_64.s @@ -7286,7 +7286,7 @@ _avx2_shortcut: .byte 0xf3,0xc3 .cfi_endproc .size sha1_multi_block_avx2,.-sha1_multi_block_avx2 - +.section .rodata .align 256 .long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 .long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 @@ -7301,6 +7301,7 @@ K_XX_XX: .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f .byte 0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0 .byte 83,72,65,49,32,109,117,108,116,105,45,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.previous .section ".note.gnu.property", "a" .p2align 3 .long 1f - 0f diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/sha/sha1-x86_64.s b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/sha/sha1-x86_64.s index 3a03212f8b6b..f52bb1bbe677 100644 --- a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/sha/sha1-x86_64.s +++ b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/sha/sha1-x86_64.s @@ -5433,6 +5433,7 @@ _avx2_shortcut: .byte 0xf3,0xc3 .cfi_endproc .size sha1_block_data_order_avx2,.-sha1_block_data_order_avx2 +.section .rodata .align 64 K_XX_XX: .long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 @@ -5446,6 +5447,7 @@ K_XX_XX: .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f .byte 0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0 +.previous .byte 83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 64 .section ".note.gnu.property", "a" diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/sha/sha256-mb-x86_64.s b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/sha/sha256-mb-x86_64.s index 8f9e4bfe5cf0..34f57dcc0fc3 100644 --- a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/sha/sha256-mb-x86_64.s +++ b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/sha/sha256-mb-x86_64.s @@ -7831,6 +7831,7 @@ _avx2_shortcut: .byte 0xf3,0xc3 .cfi_endproc .size sha256_multi_block_avx2,.-sha256_multi_block_avx2 +.section .rodata .align 256 K256: .long 1116352408,1116352408,1116352408,1116352408 @@ -7982,6 +7983,7 @@ K256_shaext: .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 .byte 83,72,65,50,53,54,32,109,117,108,116,105,45,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.previous .section 
".note.gnu.property", "a" .p2align 3 .long 1f - 0f diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/sha/sha256-x86_64.s b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/sha/sha256-x86_64.s index 1b03ce39b999..593e5e1d45e7 100644 --- a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/sha/sha256-x86_64.s +++ b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/sha/sha256-x86_64.s @@ -1728,6 +1728,7 @@ sha256_block_data_order: .byte 0xf3,0xc3 .cfi_endproc .size sha256_block_data_order,.-sha256_block_data_order +.section .rodata .align 64 .type K256,@object K256: @@ -1771,6 +1772,7 @@ K256: .long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 .long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 .byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.previous .type sha256_block_data_order_shaext,@function .align 64 sha256_block_data_order_shaext: diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/sha/sha512-x86_64.s b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/sha/sha512-x86_64.s index 3744b830146d..5b50ad72701d 100644 --- a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/sha/sha512-x86_64.s +++ b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/sha/sha512-x86_64.s @@ -1726,6 +1726,7 @@ sha512_block_data_order: .byte 0xf3,0xc3 .cfi_endproc .size sha512_block_data_order,.-sha512_block_data_order +.section .rodata .align 64 .type K512,@object K512: @@ -1813,6 +1814,7 @@ K512: .quad 0x0001020304050607,0x08090a0b0c0d0e0f .quad 0x0001020304050607,0x08090a0b0c0d0e0f .byte 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.previous .type sha512_block_data_order_xop,@function .align 64 sha512_block_data_order_xop: diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/aes/aes-x86_64.nasm b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/aes/aes-x86_64.nasm index 5884b5bb2dc1..5f32da6b11ba 100644 --- a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/aes/aes-x86_64.nasm +++ b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/aes/aes-x86_64.nasm @@ -1915,6 +1915,7 @@ $L$cbc_epilogue: DB 0F3h,0C3h ;repret $L$SEH_end_AES_cbc_encrypt: +section .rdata rdata align=64 ALIGN 64 $L$AES_Te: DD 0xa56363c6,0xa56363c6 @@ -2704,6 +2705,7 @@ DB 67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97 DB 112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103 DB 62,0 ALIGN 64 +section .text EXTERN __imp_RtlVirtualUnwind ALIGN 16 diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/aes/aesni-sha1-x86_64.nasm b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/aes/aesni-sha1-x86_64.nasm index f4ed3f708433..b25ef3f18837 100644 --- a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/aes/aesni-sha1-x86_64.nasm +++ b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/aes/aesni-sha1-x86_64.nasm @@ -2781,6 +2781,7 @@ $L$epilogue_avx: DB 0F3h,0C3h ;repret $L$SEH_end_aesni_cbc_sha1_enc_avx: +section .rdata rdata align=64 ALIGN 64 K_XX_XX: DD 0x5a827999,0x5a827999,0x5a827999,0x5a827999 @@ -2796,6 +2797,7 @@ DB 44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32 DB 60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111 DB 
114,103,62,0 ALIGN 64 +section .text ALIGN 32 aesni_cbc_sha1_enc_shaext: diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/aes/aesni-sha256-x86_64.nasm b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/aes/aesni-sha256-x86_64.nasm index b2a9c65f5d08..a30d38b5bf32 100644 --- a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/aes/aesni-sha256-x86_64.nasm +++ b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/aes/aesni-sha256-x86_64.nasm @@ -39,6 +39,7 @@ $L$probe: +section .rdata rdata align=64 ALIGN 64 K256: @@ -85,6 +86,7 @@ DB 54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98 DB 121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108 DB 46,111,114,103,62,0 ALIGN 64 +section .text ALIGN 64 aesni_cbc_sha256_enc_xop: diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/aes/aesni-x86_64.nasm b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/aes/aesni-x86_64.nasm index 75a9780a38ad..55fb980c93be 100644 --- a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/aes/aesni-x86_64.nasm +++ b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/aes/aesni-x86_64.nasm @@ -4762,6 +4762,7 @@ $L$key_expansion_256b: +section .rdata rdata align=64 ALIGN 64 $L$bswap_mask: DB 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 @@ -4787,6 +4788,7 @@ DB 83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83 DB 32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115 DB 115,108,46,111,114,103,62,0 ALIGN 64 +section .text EXTERN __imp_RtlVirtualUnwind ALIGN 16 diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/aes/bsaes-x86_64.nasm b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/aes/bsaes-x86_64.nasm index 3ef944cab2da..dab6c115b70c 100644 --- a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/aes/bsaes-x86_64.nasm +++ b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/aes/bsaes-x86_64.nasm @@ -1615,6 +1615,7 @@ global ossl_bsaes_xts_encrypt ALIGN 16 ossl_bsaes_xts_encrypt: +DB 243,15,30,250 mov rax,rsp $L$xts_enc_prologue: push rbp @@ -2110,6 +2111,7 @@ global ossl_bsaes_xts_decrypt ALIGN 16 ossl_bsaes_xts_decrypt: +DB 243,15,30,250 mov rax,rsp $L$xts_dec_prologue: push rbp @@ -2625,6 +2627,7 @@ $L$xts_dec_epilogue: +section .rdata rdata align=64 ALIGN 64 _bsaes_const: $L$M0ISR: @@ -2676,13 +2679,13 @@ $L$M0: DQ 0x02060a0e03070b0f,0x0004080c0105090d $L$63: DQ 0x6363636363636363,0x6363636363636363 +ALIGN 64 + DB 66,105,116,45,115,108,105,99,101,100,32,65,69,83,32,102 DB 111,114,32,120,56,54,95,54,52,47,83,83,83,69,51,44 DB 32,69,109,105,108,105,97,32,75,195,164,115,112,101,114,44 DB 32,80,101,116,101,114,32,83,99,104,119,97,98,101,44,32 DB 65,110,100,121,32,80,111,108,121,97,107,111,118,0 -ALIGN 64 - EXTERN __imp_RtlVirtualUnwind ALIGN 16 diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/aes/vpaes-x86_64.nasm b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/aes/vpaes-x86_64.nasm index 74f87a0f8723..decb571de900 100644 --- a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/aes/vpaes-x86_64.nasm +++ b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/aes/vpaes-x86_64.nasm @@ -941,6 +941,7 @@ _vpaes_preheat: +section .rdata rdata align=64 ALIGN 64 _vpaes_consts: $L$k_inv: @@ -1036,13 +1037,13 @@ $L$k_dsbe: $L$k_dsbo: DQ 0x1387EA537EF94000,0xC7AA6DB9D4943E2D DQ 0x12D7560F93441D00,0xCA4B8159D8C58E9C +ALIGN 64 + DB 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105 DB 111,110,32,65,69,83,32,102,111,114,32,120,56,54,95,54 DB 52,47,83,83,83,69,51,44,32,77,105,107,101,32,72,97 
DB 109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32 DB 85,110,105,118,101,114,115,105,116,121,41,0 -ALIGN 64 - EXTERN __imp_RtlVirtualUnwind ALIGN 16 diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/bn/rsaz-2k-avx512.nasm b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/bn/rsaz-2k-avx512.nasm new file mode 100644 index 000000000000..ed5b2944ed3c --- /dev/null +++ b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/bn/rsaz-2k-avx512.nasm @@ -0,0 +1,1024 @@ +default rel +%define XMMWORD +%define YMMWORD +%define ZMMWORD +EXTERN OPENSSL_ia32cap_P +global ossl_rsaz_avx512ifma_eligible + +ALIGN 32 +ossl_rsaz_avx512ifma_eligible: + mov ecx,DWORD[((OPENSSL_ia32cap_P+8))] + xor eax,eax + and ecx,2149777408 + cmp ecx,2149777408 + cmove eax,ecx + DB 0F3h,0C3h ;repret + +section .text code align=64 + + +global ossl_rsaz_amm52x20_x1_ifma256 + +ALIGN 32 +ossl_rsaz_amm52x20_x1_ifma256: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_ossl_rsaz_amm52x20_x1_ifma256: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD[40+rsp] + + + +DB 243,15,30,250 + push rbx + + push rbp + + push r12 + + push r13 + + push r14 + + push r15 + +$L$ossl_rsaz_amm52x20_x1_ifma256_body: + + + vpxord ymm0,ymm0,ymm0 + vmovdqa64 ymm3,ymm0 + vmovdqa64 ymm16,ymm0 + vmovdqa64 ymm17,ymm0 + vmovdqa64 ymm18,ymm0 + vmovdqa64 ymm19,ymm0 + + xor r9d,r9d + + mov r11,rdx + mov rax,0xfffffffffffff + + + mov ebx,5 + +ALIGN 32 +$L$loop5: + mov r13,QWORD[r11] + + vpbroadcastq ymm1,r13 + mov rdx,QWORD[rsi] + mulx r12,r13,r13 + add r9,r13 + mov r10,r12 + adc r10,0 + + mov r13,r8 + imul r13,r9 + and r13,rax + + vpbroadcastq ymm2,r13 + mov rdx,QWORD[rcx] + mulx r12,r13,r13 + add r9,r13 + adc r10,r12 + + shr r9,52 + sal r10,12 + or r9,r10 + + vpmadd52luq ymm3,ymm1,YMMWORD[rsi] + vpmadd52luq ymm16,ymm1,YMMWORD[32+rsi] + vpmadd52luq ymm17,ymm1,YMMWORD[64+rsi] + vpmadd52luq ymm18,ymm1,YMMWORD[96+rsi] + vpmadd52luq ymm19,ymm1,YMMWORD[128+rsi] + + vpmadd52luq ymm3,ymm2,YMMWORD[rcx] + vpmadd52luq ymm16,ymm2,YMMWORD[32+rcx] + vpmadd52luq ymm17,ymm2,YMMWORD[64+rcx] + vpmadd52luq ymm18,ymm2,YMMWORD[96+rcx] + vpmadd52luq ymm19,ymm2,YMMWORD[128+rcx] + + + valignq ymm3,ymm16,ymm3,1 + valignq ymm16,ymm17,ymm16,1 + valignq ymm17,ymm18,ymm17,1 + valignq ymm18,ymm19,ymm18,1 + valignq ymm19,ymm0,ymm19,1 + + vmovq r13,xmm3 + add r9,r13 + + vpmadd52huq ymm3,ymm1,YMMWORD[rsi] + vpmadd52huq ymm16,ymm1,YMMWORD[32+rsi] + vpmadd52huq ymm17,ymm1,YMMWORD[64+rsi] + vpmadd52huq ymm18,ymm1,YMMWORD[96+rsi] + vpmadd52huq ymm19,ymm1,YMMWORD[128+rsi] + + vpmadd52huq ymm3,ymm2,YMMWORD[rcx] + vpmadd52huq ymm16,ymm2,YMMWORD[32+rcx] + vpmadd52huq ymm17,ymm2,YMMWORD[64+rcx] + vpmadd52huq ymm18,ymm2,YMMWORD[96+rcx] + vpmadd52huq ymm19,ymm2,YMMWORD[128+rcx] + mov r13,QWORD[8+r11] + + vpbroadcastq ymm1,r13 + mov rdx,QWORD[rsi] + mulx r12,r13,r13 + add r9,r13 + mov r10,r12 + adc r10,0 + + mov r13,r8 + imul r13,r9 + and r13,rax + + vpbroadcastq ymm2,r13 + mov rdx,QWORD[rcx] + mulx r12,r13,r13 + add r9,r13 + adc r10,r12 + + shr r9,52 + sal r10,12 + or r9,r10 + + vpmadd52luq ymm3,ymm1,YMMWORD[rsi] + vpmadd52luq ymm16,ymm1,YMMWORD[32+rsi] + vpmadd52luq ymm17,ymm1,YMMWORD[64+rsi] + vpmadd52luq ymm18,ymm1,YMMWORD[96+rsi] + vpmadd52luq ymm19,ymm1,YMMWORD[128+rsi] + + vpmadd52luq ymm3,ymm2,YMMWORD[rcx] + vpmadd52luq ymm16,ymm2,YMMWORD[32+rcx] + vpmadd52luq ymm17,ymm2,YMMWORD[64+rcx] + vpmadd52luq ymm18,ymm2,YMMWORD[96+rcx] + vpmadd52luq ymm19,ymm2,YMMWORD[128+rcx] + + + valignq ymm3,ymm16,ymm3,1 + 
valignq ymm16,ymm17,ymm16,1 + valignq ymm17,ymm18,ymm17,1 + valignq ymm18,ymm19,ymm18,1 + valignq ymm19,ymm0,ymm19,1 + + vmovq r13,xmm3 + add r9,r13 + + vpmadd52huq ymm3,ymm1,YMMWORD[rsi] + vpmadd52huq ymm16,ymm1,YMMWORD[32+rsi] + vpmadd52huq ymm17,ymm1,YMMWORD[64+rsi] + vpmadd52huq ymm18,ymm1,YMMWORD[96+rsi] + vpmadd52huq ymm19,ymm1,YMMWORD[128+rsi] + + vpmadd52huq ymm3,ymm2,YMMWORD[rcx] + vpmadd52huq ymm16,ymm2,YMMWORD[32+rcx] + vpmadd52huq ymm17,ymm2,YMMWORD[64+rcx] + vpmadd52huq ymm18,ymm2,YMMWORD[96+rcx] + vpmadd52huq ymm19,ymm2,YMMWORD[128+rcx] + mov r13,QWORD[16+r11] + + vpbroadcastq ymm1,r13 + mov rdx,QWORD[rsi] + mulx r12,r13,r13 + add r9,r13 + mov r10,r12 + adc r10,0 + + mov r13,r8 + imul r13,r9 + and r13,rax + + vpbroadcastq ymm2,r13 + mov rdx,QWORD[rcx] + mulx r12,r13,r13 + add r9,r13 + adc r10,r12 + + shr r9,52 + sal r10,12 + or r9,r10 + + vpmadd52luq ymm3,ymm1,YMMWORD[rsi] + vpmadd52luq ymm16,ymm1,YMMWORD[32+rsi] + vpmadd52luq ymm17,ymm1,YMMWORD[64+rsi] + vpmadd52luq ymm18,ymm1,YMMWORD[96+rsi] + vpmadd52luq ymm19,ymm1,YMMWORD[128+rsi] + + vpmadd52luq ymm3,ymm2,YMMWORD[rcx] + vpmadd52luq ymm16,ymm2,YMMWORD[32+rcx] + vpmadd52luq ymm17,ymm2,YMMWORD[64+rcx] + vpmadd52luq ymm18,ymm2,YMMWORD[96+rcx] + vpmadd52luq ymm19,ymm2,YMMWORD[128+rcx] + + + valignq ymm3,ymm16,ymm3,1 + valignq ymm16,ymm17,ymm16,1 + valignq ymm17,ymm18,ymm17,1 + valignq ymm18,ymm19,ymm18,1 + valignq ymm19,ymm0,ymm19,1 + + vmovq r13,xmm3 + add r9,r13 + + vpmadd52huq ymm3,ymm1,YMMWORD[rsi] + vpmadd52huq ymm16,ymm1,YMMWORD[32+rsi] + vpmadd52huq ymm17,ymm1,YMMWORD[64+rsi] + vpmadd52huq ymm18,ymm1,YMMWORD[96+rsi] + vpmadd52huq ymm19,ymm1,YMMWORD[128+rsi] + + vpmadd52huq ymm3,ymm2,YMMWORD[rcx] + vpmadd52huq ymm16,ymm2,YMMWORD[32+rcx] + vpmadd52huq ymm17,ymm2,YMMWORD[64+rcx] + vpmadd52huq ymm18,ymm2,YMMWORD[96+rcx] + vpmadd52huq ymm19,ymm2,YMMWORD[128+rcx] + mov r13,QWORD[24+r11] + + vpbroadcastq ymm1,r13 + mov rdx,QWORD[rsi] + mulx r12,r13,r13 + add r9,r13 + mov r10,r12 + adc r10,0 + + mov r13,r8 + imul r13,r9 + and r13,rax + + vpbroadcastq ymm2,r13 + mov rdx,QWORD[rcx] + mulx r12,r13,r13 + add r9,r13 + adc r10,r12 + + shr r9,52 + sal r10,12 + or r9,r10 + + vpmadd52luq ymm3,ymm1,YMMWORD[rsi] + vpmadd52luq ymm16,ymm1,YMMWORD[32+rsi] + vpmadd52luq ymm17,ymm1,YMMWORD[64+rsi] + vpmadd52luq ymm18,ymm1,YMMWORD[96+rsi] + vpmadd52luq ymm19,ymm1,YMMWORD[128+rsi] + + vpmadd52luq ymm3,ymm2,YMMWORD[rcx] + vpmadd52luq ymm16,ymm2,YMMWORD[32+rcx] + vpmadd52luq ymm17,ymm2,YMMWORD[64+rcx] + vpmadd52luq ymm18,ymm2,YMMWORD[96+rcx] + vpmadd52luq ymm19,ymm2,YMMWORD[128+rcx] + + + valignq ymm3,ymm16,ymm3,1 + valignq ymm16,ymm17,ymm16,1 + valignq ymm17,ymm18,ymm17,1 + valignq ymm18,ymm19,ymm18,1 + valignq ymm19,ymm0,ymm19,1 + + vmovq r13,xmm3 + add r9,r13 + + vpmadd52huq ymm3,ymm1,YMMWORD[rsi] + vpmadd52huq ymm16,ymm1,YMMWORD[32+rsi] + vpmadd52huq ymm17,ymm1,YMMWORD[64+rsi] + vpmadd52huq ymm18,ymm1,YMMWORD[96+rsi] + vpmadd52huq ymm19,ymm1,YMMWORD[128+rsi] + + vpmadd52huq ymm3,ymm2,YMMWORD[rcx] + vpmadd52huq ymm16,ymm2,YMMWORD[32+rcx] + vpmadd52huq ymm17,ymm2,YMMWORD[64+rcx] + vpmadd52huq ymm18,ymm2,YMMWORD[96+rcx] + vpmadd52huq ymm19,ymm2,YMMWORD[128+rcx] + lea r11,[32+r11] + dec ebx + jne NEAR $L$loop5 + + vpbroadcastq ymm0,r9 + vpblendd ymm3,ymm3,ymm0,3 + + + + vpsrlq ymm0,ymm3,52 + vpsrlq ymm1,ymm16,52 + vpsrlq ymm2,ymm17,52 + vpsrlq ymm25,ymm18,52 + vpsrlq ymm26,ymm19,52 + + + valignq ymm26,ymm26,ymm25,3 + valignq ymm25,ymm25,ymm2,3 + valignq ymm2,ymm2,ymm1,3 + valignq ymm1,ymm1,ymm0,3 + valignq ymm0,ymm0,YMMWORD[$L$zeros],3 + + + 
vpandq ymm3,ymm3,YMMWORD[$L$mask52x4] + vpandq ymm16,ymm16,YMMWORD[$L$mask52x4] + vpandq ymm17,ymm17,YMMWORD[$L$mask52x4] + vpandq ymm18,ymm18,YMMWORD[$L$mask52x4] + vpandq ymm19,ymm19,YMMWORD[$L$mask52x4] + + + vpaddq ymm3,ymm3,ymm0 + vpaddq ymm16,ymm16,ymm1 + vpaddq ymm17,ymm17,ymm2 + vpaddq ymm18,ymm18,ymm25 + vpaddq ymm19,ymm19,ymm26 + + + + vpcmpuq k1,ymm3,YMMWORD[$L$mask52x4],6 + vpcmpuq k2,ymm16,YMMWORD[$L$mask52x4],6 + vpcmpuq k3,ymm17,YMMWORD[$L$mask52x4],6 + vpcmpuq k4,ymm18,YMMWORD[$L$mask52x4],6 + vpcmpuq k5,ymm19,YMMWORD[$L$mask52x4],6 + kmovb r14d,k1 + kmovb r13d,k2 + kmovb r12d,k3 + kmovb r11d,k4 + kmovb r10d,k5 + + + vpcmpuq k1,ymm3,YMMWORD[$L$mask52x4],0 + vpcmpuq k2,ymm16,YMMWORD[$L$mask52x4],0 + vpcmpuq k3,ymm17,YMMWORD[$L$mask52x4],0 + vpcmpuq k4,ymm18,YMMWORD[$L$mask52x4],0 + vpcmpuq k5,ymm19,YMMWORD[$L$mask52x4],0 + kmovb r9d,k1 + kmovb r8d,k2 + kmovb ebx,k3 + kmovb ecx,k4 + kmovb edx,k5 + + + + shl r13b,4 + or r14b,r13b + shl r11b,4 + or r12b,r11b + + add r14b,r14b + adc r12b,r12b + adc r10b,r10b + + shl r8b,4 + or r9b,r8b + shl cl,4 + or bl,cl + + add r14b,r9b + adc r12b,bl + adc r10b,dl + + xor r14b,r9b + xor r12b,bl + xor r10b,dl + + kmovb k1,r14d + shr r14b,4 + kmovb k2,r14d + kmovb k3,r12d + shr r12b,4 + kmovb k4,r12d + kmovb k5,r10d + + + vpsubq ymm3{k1},ymm3,YMMWORD[$L$mask52x4] + vpsubq ymm16{k2},ymm16,YMMWORD[$L$mask52x4] + vpsubq ymm17{k3},ymm17,YMMWORD[$L$mask52x4] + vpsubq ymm18{k4},ymm18,YMMWORD[$L$mask52x4] + vpsubq ymm19{k5},ymm19,YMMWORD[$L$mask52x4] + + vpandq ymm3,ymm3,YMMWORD[$L$mask52x4] + vpandq ymm16,ymm16,YMMWORD[$L$mask52x4] + vpandq ymm17,ymm17,YMMWORD[$L$mask52x4] + vpandq ymm18,ymm18,YMMWORD[$L$mask52x4] + vpandq ymm19,ymm19,YMMWORD[$L$mask52x4] + + vmovdqu64 YMMWORD[rdi],ymm3 + vmovdqu64 YMMWORD[32+rdi],ymm16 + vmovdqu64 YMMWORD[64+rdi],ymm17 + vmovdqu64 YMMWORD[96+rdi],ymm18 + vmovdqu64 YMMWORD[128+rdi],ymm19 + + vzeroupper + mov r15,QWORD[rsp] + + mov r14,QWORD[8+rsp] + + mov r13,QWORD[16+rsp] + + mov r12,QWORD[24+rsp] + + mov rbp,QWORD[32+rsp] + + mov rbx,QWORD[40+rsp] + + lea rsp,[48+rsp] + +$L$ossl_rsaz_amm52x20_x1_ifma256_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +$L$SEH_end_ossl_rsaz_amm52x20_x1_ifma256: +section .rdata rdata align=32 +ALIGN 32 +$L$mask52x4: + DQ 0xfffffffffffff + DQ 0xfffffffffffff + DQ 0xfffffffffffff + DQ 0xfffffffffffff +section .text code align=64 + + +global ossl_rsaz_amm52x20_x2_ifma256 + +ALIGN 32 +ossl_rsaz_amm52x20_x2_ifma256: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_ossl_rsaz_amm52x20_x2_ifma256: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD[40+rsp] + + + +DB 243,15,30,250 + push rbx + + push rbp + + push r12 + + push r13 + + push r14 + + push r15 + +$L$ossl_rsaz_amm52x20_x2_ifma256_body: + + + vpxord ymm0,ymm0,ymm0 + vmovdqa64 ymm3,ymm0 + vmovdqa64 ymm16,ymm0 + vmovdqa64 ymm17,ymm0 + vmovdqa64 ymm18,ymm0 + vmovdqa64 ymm19,ymm0 + vmovdqa64 ymm4,ymm0 + vmovdqa64 ymm20,ymm0 + vmovdqa64 ymm21,ymm0 + vmovdqa64 ymm22,ymm0 + vmovdqa64 ymm23,ymm0 + + xor r9d,r9d + xor r15d,r15d + + mov r11,rdx + mov rax,0xfffffffffffff + + mov ebx,20 + +ALIGN 32 +$L$loop20: + mov r13,QWORD[r11] + + vpbroadcastq ymm1,r13 + mov rdx,QWORD[rsi] + mulx r12,r13,r13 + add r9,r13 + mov r10,r12 + adc r10,0 + + mov r13,QWORD[r8] + imul r13,r9 + and r13,rax + + vpbroadcastq ymm2,r13 + mov rdx,QWORD[rcx] + mulx r12,r13,r13 + add r9,r13 + adc r10,r12 + + shr r9,52 + sal r10,12 + or r9,r10 + + vpmadd52luq 
ymm3,ymm1,YMMWORD[rsi] + vpmadd52luq ymm16,ymm1,YMMWORD[32+rsi] + vpmadd52luq ymm17,ymm1,YMMWORD[64+rsi] + vpmadd52luq ymm18,ymm1,YMMWORD[96+rsi] + vpmadd52luq ymm19,ymm1,YMMWORD[128+rsi] + + vpmadd52luq ymm3,ymm2,YMMWORD[rcx] + vpmadd52luq ymm16,ymm2,YMMWORD[32+rcx] + vpmadd52luq ymm17,ymm2,YMMWORD[64+rcx] + vpmadd52luq ymm18,ymm2,YMMWORD[96+rcx] + vpmadd52luq ymm19,ymm2,YMMWORD[128+rcx] + + + valignq ymm3,ymm16,ymm3,1 + valignq ymm16,ymm17,ymm16,1 + valignq ymm17,ymm18,ymm17,1 + valignq ymm18,ymm19,ymm18,1 + valignq ymm19,ymm0,ymm19,1 + + vmovq r13,xmm3 + add r9,r13 + + vpmadd52huq ymm3,ymm1,YMMWORD[rsi] + vpmadd52huq ymm16,ymm1,YMMWORD[32+rsi] + vpmadd52huq ymm17,ymm1,YMMWORD[64+rsi] + vpmadd52huq ymm18,ymm1,YMMWORD[96+rsi] + vpmadd52huq ymm19,ymm1,YMMWORD[128+rsi] + + vpmadd52huq ymm3,ymm2,YMMWORD[rcx] + vpmadd52huq ymm16,ymm2,YMMWORD[32+rcx] + vpmadd52huq ymm17,ymm2,YMMWORD[64+rcx] + vpmadd52huq ymm18,ymm2,YMMWORD[96+rcx] + vpmadd52huq ymm19,ymm2,YMMWORD[128+rcx] + mov r13,QWORD[160+r11] + + vpbroadcastq ymm1,r13 + mov rdx,QWORD[160+rsi] + mulx r12,r13,r13 + add r15,r13 + mov r10,r12 + adc r10,0 + + mov r13,QWORD[8+r8] + imul r13,r15 + and r13,rax + + vpbroadcastq ymm2,r13 + mov rdx,QWORD[160+rcx] + mulx r12,r13,r13 + add r15,r13 + adc r10,r12 + + shr r15,52 + sal r10,12 + or r15,r10 + + vpmadd52luq ymm4,ymm1,YMMWORD[160+rsi] + vpmadd52luq ymm20,ymm1,YMMWORD[192+rsi] + vpmadd52luq ymm21,ymm1,YMMWORD[224+rsi] + vpmadd52luq ymm22,ymm1,YMMWORD[256+rsi] + vpmadd52luq ymm23,ymm1,YMMWORD[288+rsi] + + vpmadd52luq ymm4,ymm2,YMMWORD[160+rcx] + vpmadd52luq ymm20,ymm2,YMMWORD[192+rcx] + vpmadd52luq ymm21,ymm2,YMMWORD[224+rcx] + vpmadd52luq ymm22,ymm2,YMMWORD[256+rcx] + vpmadd52luq ymm23,ymm2,YMMWORD[288+rcx] + + + valignq ymm4,ymm20,ymm4,1 + valignq ymm20,ymm21,ymm20,1 + valignq ymm21,ymm22,ymm21,1 + valignq ymm22,ymm23,ymm22,1 + valignq ymm23,ymm0,ymm23,1 + + vmovq r13,xmm4 + add r15,r13 + + vpmadd52huq ymm4,ymm1,YMMWORD[160+rsi] + vpmadd52huq ymm20,ymm1,YMMWORD[192+rsi] + vpmadd52huq ymm21,ymm1,YMMWORD[224+rsi] + vpmadd52huq ymm22,ymm1,YMMWORD[256+rsi] + vpmadd52huq ymm23,ymm1,YMMWORD[288+rsi] + + vpmadd52huq ymm4,ymm2,YMMWORD[160+rcx] + vpmadd52huq ymm20,ymm2,YMMWORD[192+rcx] + vpmadd52huq ymm21,ymm2,YMMWORD[224+rcx] + vpmadd52huq ymm22,ymm2,YMMWORD[256+rcx] + vpmadd52huq ymm23,ymm2,YMMWORD[288+rcx] + lea r11,[8+r11] + dec ebx + jne NEAR $L$loop20 + + vpbroadcastq ymm0,r9 + vpblendd ymm3,ymm3,ymm0,3 + + + + vpsrlq ymm0,ymm3,52 + vpsrlq ymm1,ymm16,52 + vpsrlq ymm2,ymm17,52 + vpsrlq ymm25,ymm18,52 + vpsrlq ymm26,ymm19,52 + + + valignq ymm26,ymm26,ymm25,3 + valignq ymm25,ymm25,ymm2,3 + valignq ymm2,ymm2,ymm1,3 + valignq ymm1,ymm1,ymm0,3 + valignq ymm0,ymm0,YMMWORD[$L$zeros],3 + + + vpandq ymm3,ymm3,YMMWORD[$L$mask52x4] + vpandq ymm16,ymm16,YMMWORD[$L$mask52x4] + vpandq ymm17,ymm17,YMMWORD[$L$mask52x4] + vpandq ymm18,ymm18,YMMWORD[$L$mask52x4] + vpandq ymm19,ymm19,YMMWORD[$L$mask52x4] + + + vpaddq ymm3,ymm3,ymm0 + vpaddq ymm16,ymm16,ymm1 + vpaddq ymm17,ymm17,ymm2 + vpaddq ymm18,ymm18,ymm25 + vpaddq ymm19,ymm19,ymm26 + + + + vpcmpuq k1,ymm3,YMMWORD[$L$mask52x4],6 + vpcmpuq k2,ymm16,YMMWORD[$L$mask52x4],6 + vpcmpuq k3,ymm17,YMMWORD[$L$mask52x4],6 + vpcmpuq k4,ymm18,YMMWORD[$L$mask52x4],6 + vpcmpuq k5,ymm19,YMMWORD[$L$mask52x4],6 + kmovb r14d,k1 + kmovb r13d,k2 + kmovb r12d,k3 + kmovb r11d,k4 + kmovb r10d,k5 + + + vpcmpuq k1,ymm3,YMMWORD[$L$mask52x4],0 + vpcmpuq k2,ymm16,YMMWORD[$L$mask52x4],0 + vpcmpuq k3,ymm17,YMMWORD[$L$mask52x4],0 + vpcmpuq k4,ymm18,YMMWORD[$L$mask52x4],0 + vpcmpuq 
k5,ymm19,YMMWORD[$L$mask52x4],0 + kmovb r9d,k1 + kmovb r8d,k2 + kmovb ebx,k3 + kmovb ecx,k4 + kmovb edx,k5 + + + + shl r13b,4 + or r14b,r13b + shl r11b,4 + or r12b,r11b + + add r14b,r14b + adc r12b,r12b + adc r10b,r10b + + shl r8b,4 + or r9b,r8b + shl cl,4 + or bl,cl + + add r14b,r9b + adc r12b,bl + adc r10b,dl + + xor r14b,r9b + xor r12b,bl + xor r10b,dl + + kmovb k1,r14d + shr r14b,4 + kmovb k2,r14d + kmovb k3,r12d + shr r12b,4 + kmovb k4,r12d + kmovb k5,r10d + + + vpsubq ymm3{k1},ymm3,YMMWORD[$L$mask52x4] + vpsubq ymm16{k2},ymm16,YMMWORD[$L$mask52x4] + vpsubq ymm17{k3},ymm17,YMMWORD[$L$mask52x4] + vpsubq ymm18{k4},ymm18,YMMWORD[$L$mask52x4] + vpsubq ymm19{k5},ymm19,YMMWORD[$L$mask52x4] + + vpandq ymm3,ymm3,YMMWORD[$L$mask52x4] + vpandq ymm16,ymm16,YMMWORD[$L$mask52x4] + vpandq ymm17,ymm17,YMMWORD[$L$mask52x4] + vpandq ymm18,ymm18,YMMWORD[$L$mask52x4] + vpandq ymm19,ymm19,YMMWORD[$L$mask52x4] + + vpbroadcastq ymm0,r15 + vpblendd ymm4,ymm4,ymm0,3 + + + + vpsrlq ymm0,ymm4,52 + vpsrlq ymm1,ymm20,52 + vpsrlq ymm2,ymm21,52 + vpsrlq ymm25,ymm22,52 + vpsrlq ymm26,ymm23,52 + + + valignq ymm26,ymm26,ymm25,3 + valignq ymm25,ymm25,ymm2,3 + valignq ymm2,ymm2,ymm1,3 + valignq ymm1,ymm1,ymm0,3 + valignq ymm0,ymm0,YMMWORD[$L$zeros],3 + + + vpandq ymm4,ymm4,YMMWORD[$L$mask52x4] + vpandq ymm20,ymm20,YMMWORD[$L$mask52x4] + vpandq ymm21,ymm21,YMMWORD[$L$mask52x4] + vpandq ymm22,ymm22,YMMWORD[$L$mask52x4] + vpandq ymm23,ymm23,YMMWORD[$L$mask52x4] + + + vpaddq ymm4,ymm4,ymm0 + vpaddq ymm20,ymm20,ymm1 + vpaddq ymm21,ymm21,ymm2 + vpaddq ymm22,ymm22,ymm25 + vpaddq ymm23,ymm23,ymm26 + + + + vpcmpuq k1,ymm4,YMMWORD[$L$mask52x4],6 + vpcmpuq k2,ymm20,YMMWORD[$L$mask52x4],6 + vpcmpuq k3,ymm21,YMMWORD[$L$mask52x4],6 + vpcmpuq k4,ymm22,YMMWORD[$L$mask52x4],6 + vpcmpuq k5,ymm23,YMMWORD[$L$mask52x4],6 + kmovb r14d,k1 + kmovb r13d,k2 + kmovb r12d,k3 + kmovb r11d,k4 + kmovb r10d,k5 + + + vpcmpuq k1,ymm4,YMMWORD[$L$mask52x4],0 + vpcmpuq k2,ymm20,YMMWORD[$L$mask52x4],0 + vpcmpuq k3,ymm21,YMMWORD[$L$mask52x4],0 + vpcmpuq k4,ymm22,YMMWORD[$L$mask52x4],0 + vpcmpuq k5,ymm23,YMMWORD[$L$mask52x4],0 + kmovb r9d,k1 + kmovb r8d,k2 + kmovb ebx,k3 + kmovb ecx,k4 + kmovb edx,k5 + + + + shl r13b,4 + or r14b,r13b + shl r11b,4 + or r12b,r11b + + add r14b,r14b + adc r12b,r12b + adc r10b,r10b + + shl r8b,4 + or r9b,r8b + shl cl,4 + or bl,cl + + add r14b,r9b + adc r12b,bl + adc r10b,dl + + xor r14b,r9b + xor r12b,bl + xor r10b,dl + + kmovb k1,r14d + shr r14b,4 + kmovb k2,r14d + kmovb k3,r12d + shr r12b,4 + kmovb k4,r12d + kmovb k5,r10d + + + vpsubq ymm4{k1},ymm4,YMMWORD[$L$mask52x4] + vpsubq ymm20{k2},ymm20,YMMWORD[$L$mask52x4] + vpsubq ymm21{k3},ymm21,YMMWORD[$L$mask52x4] + vpsubq ymm22{k4},ymm22,YMMWORD[$L$mask52x4] + vpsubq ymm23{k5},ymm23,YMMWORD[$L$mask52x4] + + vpandq ymm4,ymm4,YMMWORD[$L$mask52x4] + vpandq ymm20,ymm20,YMMWORD[$L$mask52x4] + vpandq ymm21,ymm21,YMMWORD[$L$mask52x4] + vpandq ymm22,ymm22,YMMWORD[$L$mask52x4] + vpandq ymm23,ymm23,YMMWORD[$L$mask52x4] + + vmovdqu64 YMMWORD[rdi],ymm3 + vmovdqu64 YMMWORD[32+rdi],ymm16 + vmovdqu64 YMMWORD[64+rdi],ymm17 + vmovdqu64 YMMWORD[96+rdi],ymm18 + vmovdqu64 YMMWORD[128+rdi],ymm19 + + vmovdqu64 YMMWORD[160+rdi],ymm4 + vmovdqu64 YMMWORD[192+rdi],ymm20 + vmovdqu64 YMMWORD[224+rdi],ymm21 + vmovdqu64 YMMWORD[256+rdi],ymm22 + vmovdqu64 YMMWORD[288+rdi],ymm23 + + vzeroupper + mov r15,QWORD[rsp] + + mov r14,QWORD[8+rsp] + + mov r13,QWORD[16+rsp] + + mov r12,QWORD[24+rsp] + + mov rbp,QWORD[32+rsp] + + mov rbx,QWORD[40+rsp] + + lea rsp,[48+rsp] + +$L$ossl_rsaz_amm52x20_x2_ifma256_epilogue: + mov 
rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +$L$SEH_end_ossl_rsaz_amm52x20_x2_ifma256: +section .text code align=64 + + +ALIGN 32 +global ossl_extract_multiplier_2x20_win5 + +ossl_extract_multiplier_2x20_win5: + +DB 243,15,30,250 + vmovdqa64 ymm24,YMMWORD[$L$ones] + vpbroadcastq ymm22,r8 + vpbroadcastq ymm23,r9 + lea rax,[10240+rdx] + + + vpxor xmm0,xmm0,xmm0 + vmovdqa64 ymm21,ymm0 + vmovdqa64 ymm1,ymm0 + vmovdqa64 ymm2,ymm0 + vmovdqa64 ymm3,ymm0 + vmovdqa64 ymm4,ymm0 + vmovdqa64 ymm5,ymm0 + vmovdqa64 ymm16,ymm0 + vmovdqa64 ymm17,ymm0 + vmovdqa64 ymm18,ymm0 + vmovdqa64 ymm19,ymm0 + +ALIGN 32 +$L$loop: + vpcmpq k1,ymm22,ymm21,0 + vpcmpq k2,ymm23,ymm21,0 + vmovdqu64 ymm20,YMMWORD[rdx] + vpblendmq ymm0{k1},ymm0,ymm20 + vmovdqu64 ymm20,YMMWORD[32+rdx] + vpblendmq ymm1{k1},ymm1,ymm20 + vmovdqu64 ymm20,YMMWORD[64+rdx] + vpblendmq ymm2{k1},ymm2,ymm20 + vmovdqu64 ymm20,YMMWORD[96+rdx] + vpblendmq ymm3{k1},ymm3,ymm20 + vmovdqu64 ymm20,YMMWORD[128+rdx] + vpblendmq ymm4{k1},ymm4,ymm20 + vmovdqu64 ymm20,YMMWORD[160+rdx] + vpblendmq ymm5{k2},ymm5,ymm20 + vmovdqu64 ymm20,YMMWORD[192+rdx] + vpblendmq ymm16{k2},ymm16,ymm20 + vmovdqu64 ymm20,YMMWORD[224+rdx] + vpblendmq ymm17{k2},ymm17,ymm20 + vmovdqu64 ymm20,YMMWORD[256+rdx] + vpblendmq ymm18{k2},ymm18,ymm20 + vmovdqu64 ymm20,YMMWORD[288+rdx] + vpblendmq ymm19{k2},ymm19,ymm20 + vpaddq ymm21,ymm21,ymm24 + add rdx,320 + cmp rax,rdx + jne NEAR $L$loop + vmovdqu64 YMMWORD[rcx],ymm0 + vmovdqu64 YMMWORD[32+rcx],ymm1 + vmovdqu64 YMMWORD[64+rcx],ymm2 + vmovdqu64 YMMWORD[96+rcx],ymm3 + vmovdqu64 YMMWORD[128+rcx],ymm4 + vmovdqu64 YMMWORD[160+rcx],ymm5 + vmovdqu64 YMMWORD[192+rcx],ymm16 + vmovdqu64 YMMWORD[224+rcx],ymm17 + vmovdqu64 YMMWORD[256+rcx],ymm18 + vmovdqu64 YMMWORD[288+rcx],ymm19 + DB 0F3h,0C3h ;repret + + +section .rdata rdata align=32 +ALIGN 32 +$L$ones: + DQ 1,1,1,1 +$L$zeros: + DQ 0,0,0,0 +EXTERN __imp_RtlVirtualUnwind + +ALIGN 16 +rsaz_def_handler: + push rsi + push rdi + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + pushfq + sub rsp,64 + + mov rax,QWORD[120+r8] + mov rbx,QWORD[248+r8] + + mov rsi,QWORD[8+r9] + mov r11,QWORD[56+r9] + + mov r10d,DWORD[r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jb NEAR $L$common_seh_tail + + mov rax,QWORD[152+r8] + + mov r10d,DWORD[4+r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jae NEAR $L$common_seh_tail + + lea rax,[48+rax] + + mov rbx,QWORD[((-8))+rax] + mov rbp,QWORD[((-16))+rax] + mov r12,QWORD[((-24))+rax] + mov r13,QWORD[((-32))+rax] + mov r14,QWORD[((-40))+rax] + mov r15,QWORD[((-48))+rax] + mov QWORD[144+r8],rbx + mov QWORD[160+r8],rbp + mov QWORD[216+r8],r12 + mov QWORD[224+r8],r13 + mov QWORD[232+r8],r14 + mov QWORD[240+r8],r15 + +$L$common_seh_tail: + mov rdi,QWORD[8+rax] + mov rsi,QWORD[16+rax] + mov QWORD[152+r8],rax + mov QWORD[168+r8],rsi + mov QWORD[176+r8],rdi + + mov rdi,QWORD[40+r9] + mov rsi,r8 + mov ecx,154 + DD 0xa548f3fc + + mov rsi,r9 + xor rcx,rcx + mov rdx,QWORD[8+rsi] + mov r8,QWORD[rsi] + mov r9,QWORD[16+rsi] + mov r10,QWORD[40+rsi] + lea r11,[56+rsi] + lea r12,[24+rsi] + mov QWORD[32+rsp],r10 + mov QWORD[40+rsp],r11 + mov QWORD[48+rsp],r12 + mov QWORD[56+rsp],rcx + call QWORD[__imp_RtlVirtualUnwind] + + mov eax,1 + add rsp,64 + popfq + pop r15 + pop r14 + pop r13 + pop r12 + pop rbp + pop rbx + pop rdi + pop rsi + DB 0F3h,0C3h ;repret + + +section .pdata rdata align=4 +ALIGN 4 + DD $L$SEH_begin_ossl_rsaz_amm52x20_x1_ifma256 wrt ..imagebase + DD $L$SEH_end_ossl_rsaz_amm52x20_x1_ifma256 wrt ..imagebase + DD 
$L$SEH_info_ossl_rsaz_amm52x20_x1_ifma256 wrt ..imagebase + + DD $L$SEH_begin_ossl_rsaz_amm52x20_x2_ifma256 wrt ..imagebase + DD $L$SEH_end_ossl_rsaz_amm52x20_x2_ifma256 wrt ..imagebase + DD $L$SEH_info_ossl_rsaz_amm52x20_x2_ifma256 wrt ..imagebase + +section .xdata rdata align=8 +ALIGN 8 +$L$SEH_info_ossl_rsaz_amm52x20_x1_ifma256: +DB 9,0,0,0 + DD rsaz_def_handler wrt ..imagebase + DD $L$ossl_rsaz_amm52x20_x1_ifma256_body wrt ..imagebase,$L$ossl_rsaz_amm52x20_x1_ifma256_epilogue wrt ..imagebase +$L$SEH_info_ossl_rsaz_amm52x20_x2_ifma256: +DB 9,0,0,0 + DD rsaz_def_handler wrt ..imagebase + DD $L$ossl_rsaz_amm52x20_x2_ifma256_body wrt ..imagebase,$L$ossl_rsaz_amm52x20_x2_ifma256_epilogue wrt ..imagebase diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/bn/rsaz-3k-avx512.nasm b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/bn/rsaz-3k-avx512.nasm new file mode 100644 index 000000000000..c6e91b66c9a0 --- /dev/null +++ b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/bn/rsaz-3k-avx512.nasm @@ -0,0 +1,1490 @@ +default rel +%define XMMWORD +%define YMMWORD +%define ZMMWORD +section .text code align=64 + + +global ossl_rsaz_amm52x30_x1_ifma256 + +ALIGN 32 +ossl_rsaz_amm52x30_x1_ifma256: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_ossl_rsaz_amm52x30_x1_ifma256: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD[40+rsp] + + + +DB 243,15,30,250 + push rbx + + push rbp + + push r12 + + push r13 + + push r14 + + push r15 + + lea rsp,[((-168))+rsp] + vmovdqa64 XMMWORD[rsp],xmm6 + vmovdqa64 XMMWORD[16+rsp],xmm7 + vmovdqa64 XMMWORD[32+rsp],xmm8 + vmovdqa64 XMMWORD[48+rsp],xmm9 + vmovdqa64 XMMWORD[64+rsp],xmm10 + vmovdqa64 XMMWORD[80+rsp],xmm11 + vmovdqa64 XMMWORD[96+rsp],xmm12 + vmovdqa64 XMMWORD[112+rsp],xmm13 + vmovdqa64 XMMWORD[128+rsp],xmm14 + vmovdqa64 XMMWORD[144+rsp],xmm15 +$L$ossl_rsaz_amm52x30_x1_ifma256_body: + + vpxord ymm0,ymm0,ymm0 + vmovdqa64 ymm3,ymm0 + vmovdqa64 ymm4,ymm0 + vmovdqa64 ymm5,ymm0 + vmovdqa64 ymm6,ymm0 + vmovdqa64 ymm7,ymm0 + vmovdqa64 ymm8,ymm0 + vmovdqa64 ymm9,ymm0 + vmovdqa64 ymm10,ymm0 + + xor r9d,r9d + + mov r11,rdx + mov rax,0xfffffffffffff + + + mov ebx,7 + +ALIGN 32 +$L$loop7: + mov r13,QWORD[r11] + + vpbroadcastq ymm1,r13 + mov rdx,QWORD[rsi] + mulx r12,r13,r13 + add r9,r13 + mov r10,r12 + adc r10,0 + + mov r13,r8 + imul r13,r9 + and r13,rax + + vpbroadcastq ymm2,r13 + mov rdx,QWORD[rcx] + mulx r12,r13,r13 + add r9,r13 + adc r10,r12 + + shr r9,52 + sal r10,12 + or r9,r10 + + vpmadd52luq ymm3,ymm1,YMMWORD[rsi] + vpmadd52luq ymm4,ymm1,YMMWORD[32+rsi] + vpmadd52luq ymm5,ymm1,YMMWORD[64+rsi] + vpmadd52luq ymm6,ymm1,YMMWORD[96+rsi] + vpmadd52luq ymm7,ymm1,YMMWORD[128+rsi] + vpmadd52luq ymm8,ymm1,YMMWORD[160+rsi] + vpmadd52luq ymm9,ymm1,YMMWORD[192+rsi] + vpmadd52luq ymm10,ymm1,YMMWORD[224+rsi] + + vpmadd52luq ymm3,ymm2,YMMWORD[rcx] + vpmadd52luq ymm4,ymm2,YMMWORD[32+rcx] + vpmadd52luq ymm5,ymm2,YMMWORD[64+rcx] + vpmadd52luq ymm6,ymm2,YMMWORD[96+rcx] + vpmadd52luq ymm7,ymm2,YMMWORD[128+rcx] + vpmadd52luq ymm8,ymm2,YMMWORD[160+rcx] + vpmadd52luq ymm9,ymm2,YMMWORD[192+rcx] + vpmadd52luq ymm10,ymm2,YMMWORD[224+rcx] + + + valignq ymm3,ymm4,ymm3,1 + valignq ymm4,ymm5,ymm4,1 + valignq ymm5,ymm6,ymm5,1 + valignq ymm6,ymm7,ymm6,1 + valignq ymm7,ymm8,ymm7,1 + valignq ymm8,ymm9,ymm8,1 + valignq ymm9,ymm10,ymm9,1 + valignq ymm10,ymm0,ymm10,1 + + vmovq r13,xmm3 + add r9,r13 + + vpmadd52huq ymm3,ymm1,YMMWORD[rsi] + vpmadd52huq ymm4,ymm1,YMMWORD[32+rsi] + vpmadd52huq 
ymm5,ymm1,YMMWORD[64+rsi] + vpmadd52huq ymm6,ymm1,YMMWORD[96+rsi] + vpmadd52huq ymm7,ymm1,YMMWORD[128+rsi] + vpmadd52huq ymm8,ymm1,YMMWORD[160+rsi] + vpmadd52huq ymm9,ymm1,YMMWORD[192+rsi] + vpmadd52huq ymm10,ymm1,YMMWORD[224+rsi] + + vpmadd52huq ymm3,ymm2,YMMWORD[rcx] + vpmadd52huq ymm4,ymm2,YMMWORD[32+rcx] + vpmadd52huq ymm5,ymm2,YMMWORD[64+rcx] + vpmadd52huq ymm6,ymm2,YMMWORD[96+rcx] + vpmadd52huq ymm7,ymm2,YMMWORD[128+rcx] + vpmadd52huq ymm8,ymm2,YMMWORD[160+rcx] + vpmadd52huq ymm9,ymm2,YMMWORD[192+rcx] + vpmadd52huq ymm10,ymm2,YMMWORD[224+rcx] + mov r13,QWORD[8+r11] + + vpbroadcastq ymm1,r13 + mov rdx,QWORD[rsi] + mulx r12,r13,r13 + add r9,r13 + mov r10,r12 + adc r10,0 + + mov r13,r8 + imul r13,r9 + and r13,rax + + vpbroadcastq ymm2,r13 + mov rdx,QWORD[rcx] + mulx r12,r13,r13 + add r9,r13 + adc r10,r12 + + shr r9,52 + sal r10,12 + or r9,r10 + + vpmadd52luq ymm3,ymm1,YMMWORD[rsi] + vpmadd52luq ymm4,ymm1,YMMWORD[32+rsi] + vpmadd52luq ymm5,ymm1,YMMWORD[64+rsi] + vpmadd52luq ymm6,ymm1,YMMWORD[96+rsi] + vpmadd52luq ymm7,ymm1,YMMWORD[128+rsi] + vpmadd52luq ymm8,ymm1,YMMWORD[160+rsi] + vpmadd52luq ymm9,ymm1,YMMWORD[192+rsi] + vpmadd52luq ymm10,ymm1,YMMWORD[224+rsi] + + vpmadd52luq ymm3,ymm2,YMMWORD[rcx] + vpmadd52luq ymm4,ymm2,YMMWORD[32+rcx] + vpmadd52luq ymm5,ymm2,YMMWORD[64+rcx] + vpmadd52luq ymm6,ymm2,YMMWORD[96+rcx] + vpmadd52luq ymm7,ymm2,YMMWORD[128+rcx] + vpmadd52luq ymm8,ymm2,YMMWORD[160+rcx] + vpmadd52luq ymm9,ymm2,YMMWORD[192+rcx] + vpmadd52luq ymm10,ymm2,YMMWORD[224+rcx] + + + valignq ymm3,ymm4,ymm3,1 + valignq ymm4,ymm5,ymm4,1 + valignq ymm5,ymm6,ymm5,1 + valignq ymm6,ymm7,ymm6,1 + valignq ymm7,ymm8,ymm7,1 + valignq ymm8,ymm9,ymm8,1 + valignq ymm9,ymm10,ymm9,1 + valignq ymm10,ymm0,ymm10,1 + + vmovq r13,xmm3 + add r9,r13 + + vpmadd52huq ymm3,ymm1,YMMWORD[rsi] + vpmadd52huq ymm4,ymm1,YMMWORD[32+rsi] + vpmadd52huq ymm5,ymm1,YMMWORD[64+rsi] + vpmadd52huq ymm6,ymm1,YMMWORD[96+rsi] + vpmadd52huq ymm7,ymm1,YMMWORD[128+rsi] + vpmadd52huq ymm8,ymm1,YMMWORD[160+rsi] + vpmadd52huq ymm9,ymm1,YMMWORD[192+rsi] + vpmadd52huq ymm10,ymm1,YMMWORD[224+rsi] + + vpmadd52huq ymm3,ymm2,YMMWORD[rcx] + vpmadd52huq ymm4,ymm2,YMMWORD[32+rcx] + vpmadd52huq ymm5,ymm2,YMMWORD[64+rcx] + vpmadd52huq ymm6,ymm2,YMMWORD[96+rcx] + vpmadd52huq ymm7,ymm2,YMMWORD[128+rcx] + vpmadd52huq ymm8,ymm2,YMMWORD[160+rcx] + vpmadd52huq ymm9,ymm2,YMMWORD[192+rcx] + vpmadd52huq ymm10,ymm2,YMMWORD[224+rcx] + mov r13,QWORD[16+r11] + + vpbroadcastq ymm1,r13 + mov rdx,QWORD[rsi] + mulx r12,r13,r13 + add r9,r13 + mov r10,r12 + adc r10,0 + + mov r13,r8 + imul r13,r9 + and r13,rax + + vpbroadcastq ymm2,r13 + mov rdx,QWORD[rcx] + mulx r12,r13,r13 + add r9,r13 + adc r10,r12 + + shr r9,52 + sal r10,12 + or r9,r10 + + vpmadd52luq ymm3,ymm1,YMMWORD[rsi] + vpmadd52luq ymm4,ymm1,YMMWORD[32+rsi] + vpmadd52luq ymm5,ymm1,YMMWORD[64+rsi] + vpmadd52luq ymm6,ymm1,YMMWORD[96+rsi] + vpmadd52luq ymm7,ymm1,YMMWORD[128+rsi] + vpmadd52luq ymm8,ymm1,YMMWORD[160+rsi] + vpmadd52luq ymm9,ymm1,YMMWORD[192+rsi] + vpmadd52luq ymm10,ymm1,YMMWORD[224+rsi] + + vpmadd52luq ymm3,ymm2,YMMWORD[rcx] + vpmadd52luq ymm4,ymm2,YMMWORD[32+rcx] + vpmadd52luq ymm5,ymm2,YMMWORD[64+rcx] + vpmadd52luq ymm6,ymm2,YMMWORD[96+rcx] + vpmadd52luq ymm7,ymm2,YMMWORD[128+rcx] + vpmadd52luq ymm8,ymm2,YMMWORD[160+rcx] + vpmadd52luq ymm9,ymm2,YMMWORD[192+rcx] + vpmadd52luq ymm10,ymm2,YMMWORD[224+rcx] + + + valignq ymm3,ymm4,ymm3,1 + valignq ymm4,ymm5,ymm4,1 + valignq ymm5,ymm6,ymm5,1 + valignq ymm6,ymm7,ymm6,1 + valignq ymm7,ymm8,ymm7,1 + valignq ymm8,ymm9,ymm8,1 + valignq ymm9,ymm10,ymm9,1 
+ valignq ymm10,ymm0,ymm10,1 + + vmovq r13,xmm3 + add r9,r13 + + vpmadd52huq ymm3,ymm1,YMMWORD[rsi] + vpmadd52huq ymm4,ymm1,YMMWORD[32+rsi] + vpmadd52huq ymm5,ymm1,YMMWORD[64+rsi] + vpmadd52huq ymm6,ymm1,YMMWORD[96+rsi] + vpmadd52huq ymm7,ymm1,YMMWORD[128+rsi] + vpmadd52huq ymm8,ymm1,YMMWORD[160+rsi] + vpmadd52huq ymm9,ymm1,YMMWORD[192+rsi] + vpmadd52huq ymm10,ymm1,YMMWORD[224+rsi] + + vpmadd52huq ymm3,ymm2,YMMWORD[rcx] + vpmadd52huq ymm4,ymm2,YMMWORD[32+rcx] + vpmadd52huq ymm5,ymm2,YMMWORD[64+rcx] + vpmadd52huq ymm6,ymm2,YMMWORD[96+rcx] + vpmadd52huq ymm7,ymm2,YMMWORD[128+rcx] + vpmadd52huq ymm8,ymm2,YMMWORD[160+rcx] + vpmadd52huq ymm9,ymm2,YMMWORD[192+rcx] + vpmadd52huq ymm10,ymm2,YMMWORD[224+rcx] + mov r13,QWORD[24+r11] + + vpbroadcastq ymm1,r13 + mov rdx,QWORD[rsi] + mulx r12,r13,r13 + add r9,r13 + mov r10,r12 + adc r10,0 + + mov r13,r8 + imul r13,r9 + and r13,rax + + vpbroadcastq ymm2,r13 + mov rdx,QWORD[rcx] + mulx r12,r13,r13 + add r9,r13 + adc r10,r12 + + shr r9,52 + sal r10,12 + or r9,r10 + + vpmadd52luq ymm3,ymm1,YMMWORD[rsi] + vpmadd52luq ymm4,ymm1,YMMWORD[32+rsi] + vpmadd52luq ymm5,ymm1,YMMWORD[64+rsi] + vpmadd52luq ymm6,ymm1,YMMWORD[96+rsi] + vpmadd52luq ymm7,ymm1,YMMWORD[128+rsi] + vpmadd52luq ymm8,ymm1,YMMWORD[160+rsi] + vpmadd52luq ymm9,ymm1,YMMWORD[192+rsi] + vpmadd52luq ymm10,ymm1,YMMWORD[224+rsi] + + vpmadd52luq ymm3,ymm2,YMMWORD[rcx] + vpmadd52luq ymm4,ymm2,YMMWORD[32+rcx] + vpmadd52luq ymm5,ymm2,YMMWORD[64+rcx] + vpmadd52luq ymm6,ymm2,YMMWORD[96+rcx] + vpmadd52luq ymm7,ymm2,YMMWORD[128+rcx] + vpmadd52luq ymm8,ymm2,YMMWORD[160+rcx] + vpmadd52luq ymm9,ymm2,YMMWORD[192+rcx] + vpmadd52luq ymm10,ymm2,YMMWORD[224+rcx] + + + valignq ymm3,ymm4,ymm3,1 + valignq ymm4,ymm5,ymm4,1 + valignq ymm5,ymm6,ymm5,1 + valignq ymm6,ymm7,ymm6,1 + valignq ymm7,ymm8,ymm7,1 + valignq ymm8,ymm9,ymm8,1 + valignq ymm9,ymm10,ymm9,1 + valignq ymm10,ymm0,ymm10,1 + + vmovq r13,xmm3 + add r9,r13 + + vpmadd52huq ymm3,ymm1,YMMWORD[rsi] + vpmadd52huq ymm4,ymm1,YMMWORD[32+rsi] + vpmadd52huq ymm5,ymm1,YMMWORD[64+rsi] + vpmadd52huq ymm6,ymm1,YMMWORD[96+rsi] + vpmadd52huq ymm7,ymm1,YMMWORD[128+rsi] + vpmadd52huq ymm8,ymm1,YMMWORD[160+rsi] + vpmadd52huq ymm9,ymm1,YMMWORD[192+rsi] + vpmadd52huq ymm10,ymm1,YMMWORD[224+rsi] + + vpmadd52huq ymm3,ymm2,YMMWORD[rcx] + vpmadd52huq ymm4,ymm2,YMMWORD[32+rcx] + vpmadd52huq ymm5,ymm2,YMMWORD[64+rcx] + vpmadd52huq ymm6,ymm2,YMMWORD[96+rcx] + vpmadd52huq ymm7,ymm2,YMMWORD[128+rcx] + vpmadd52huq ymm8,ymm2,YMMWORD[160+rcx] + vpmadd52huq ymm9,ymm2,YMMWORD[192+rcx] + vpmadd52huq ymm10,ymm2,YMMWORD[224+rcx] + lea r11,[32+r11] + dec ebx + jne NEAR $L$loop7 + mov r13,QWORD[r11] + + vpbroadcastq ymm1,r13 + mov rdx,QWORD[rsi] + mulx r12,r13,r13 + add r9,r13 + mov r10,r12 + adc r10,0 + + mov r13,r8 + imul r13,r9 + and r13,rax + + vpbroadcastq ymm2,r13 + mov rdx,QWORD[rcx] + mulx r12,r13,r13 + add r9,r13 + adc r10,r12 + + shr r9,52 + sal r10,12 + or r9,r10 + + vpmadd52luq ymm3,ymm1,YMMWORD[rsi] + vpmadd52luq ymm4,ymm1,YMMWORD[32+rsi] + vpmadd52luq ymm5,ymm1,YMMWORD[64+rsi] + vpmadd52luq ymm6,ymm1,YMMWORD[96+rsi] + vpmadd52luq ymm7,ymm1,YMMWORD[128+rsi] + vpmadd52luq ymm8,ymm1,YMMWORD[160+rsi] + vpmadd52luq ymm9,ymm1,YMMWORD[192+rsi] + vpmadd52luq ymm10,ymm1,YMMWORD[224+rsi] + + vpmadd52luq ymm3,ymm2,YMMWORD[rcx] + vpmadd52luq ymm4,ymm2,YMMWORD[32+rcx] + vpmadd52luq ymm5,ymm2,YMMWORD[64+rcx] + vpmadd52luq ymm6,ymm2,YMMWORD[96+rcx] + vpmadd52luq ymm7,ymm2,YMMWORD[128+rcx] + vpmadd52luq ymm8,ymm2,YMMWORD[160+rcx] + vpmadd52luq ymm9,ymm2,YMMWORD[192+rcx] + vpmadd52luq 
ymm10,ymm2,YMMWORD[224+rcx] + + + valignq ymm3,ymm4,ymm3,1 + valignq ymm4,ymm5,ymm4,1 + valignq ymm5,ymm6,ymm5,1 + valignq ymm6,ymm7,ymm6,1 + valignq ymm7,ymm8,ymm7,1 + valignq ymm8,ymm9,ymm8,1 + valignq ymm9,ymm10,ymm9,1 + valignq ymm10,ymm0,ymm10,1 + + vmovq r13,xmm3 + add r9,r13 + + vpmadd52huq ymm3,ymm1,YMMWORD[rsi] + vpmadd52huq ymm4,ymm1,YMMWORD[32+rsi] + vpmadd52huq ymm5,ymm1,YMMWORD[64+rsi] + vpmadd52huq ymm6,ymm1,YMMWORD[96+rsi] + vpmadd52huq ymm7,ymm1,YMMWORD[128+rsi] + vpmadd52huq ymm8,ymm1,YMMWORD[160+rsi] + vpmadd52huq ymm9,ymm1,YMMWORD[192+rsi] + vpmadd52huq ymm10,ymm1,YMMWORD[224+rsi] + + vpmadd52huq ymm3,ymm2,YMMWORD[rcx] + vpmadd52huq ymm4,ymm2,YMMWORD[32+rcx] + vpmadd52huq ymm5,ymm2,YMMWORD[64+rcx] + vpmadd52huq ymm6,ymm2,YMMWORD[96+rcx] + vpmadd52huq ymm7,ymm2,YMMWORD[128+rcx] + vpmadd52huq ymm8,ymm2,YMMWORD[160+rcx] + vpmadd52huq ymm9,ymm2,YMMWORD[192+rcx] + vpmadd52huq ymm10,ymm2,YMMWORD[224+rcx] + mov r13,QWORD[8+r11] + + vpbroadcastq ymm1,r13 + mov rdx,QWORD[rsi] + mulx r12,r13,r13 + add r9,r13 + mov r10,r12 + adc r10,0 + + mov r13,r8 + imul r13,r9 + and r13,rax + + vpbroadcastq ymm2,r13 + mov rdx,QWORD[rcx] + mulx r12,r13,r13 + add r9,r13 + adc r10,r12 + + shr r9,52 + sal r10,12 + or r9,r10 + + vpmadd52luq ymm3,ymm1,YMMWORD[rsi] + vpmadd52luq ymm4,ymm1,YMMWORD[32+rsi] + vpmadd52luq ymm5,ymm1,YMMWORD[64+rsi] + vpmadd52luq ymm6,ymm1,YMMWORD[96+rsi] + vpmadd52luq ymm7,ymm1,YMMWORD[128+rsi] + vpmadd52luq ymm8,ymm1,YMMWORD[160+rsi] + vpmadd52luq ymm9,ymm1,YMMWORD[192+rsi] + vpmadd52luq ymm10,ymm1,YMMWORD[224+rsi] + + vpmadd52luq ymm3,ymm2,YMMWORD[rcx] + vpmadd52luq ymm4,ymm2,YMMWORD[32+rcx] + vpmadd52luq ymm5,ymm2,YMMWORD[64+rcx] + vpmadd52luq ymm6,ymm2,YMMWORD[96+rcx] + vpmadd52luq ymm7,ymm2,YMMWORD[128+rcx] + vpmadd52luq ymm8,ymm2,YMMWORD[160+rcx] + vpmadd52luq ymm9,ymm2,YMMWORD[192+rcx] + vpmadd52luq ymm10,ymm2,YMMWORD[224+rcx] + + + valignq ymm3,ymm4,ymm3,1 + valignq ymm4,ymm5,ymm4,1 + valignq ymm5,ymm6,ymm5,1 + valignq ymm6,ymm7,ymm6,1 + valignq ymm7,ymm8,ymm7,1 + valignq ymm8,ymm9,ymm8,1 + valignq ymm9,ymm10,ymm9,1 + valignq ymm10,ymm0,ymm10,1 + + vmovq r13,xmm3 + add r9,r13 + + vpmadd52huq ymm3,ymm1,YMMWORD[rsi] + vpmadd52huq ymm4,ymm1,YMMWORD[32+rsi] + vpmadd52huq ymm5,ymm1,YMMWORD[64+rsi] + vpmadd52huq ymm6,ymm1,YMMWORD[96+rsi] + vpmadd52huq ymm7,ymm1,YMMWORD[128+rsi] + vpmadd52huq ymm8,ymm1,YMMWORD[160+rsi] + vpmadd52huq ymm9,ymm1,YMMWORD[192+rsi] + vpmadd52huq ymm10,ymm1,YMMWORD[224+rsi] + + vpmadd52huq ymm3,ymm2,YMMWORD[rcx] + vpmadd52huq ymm4,ymm2,YMMWORD[32+rcx] + vpmadd52huq ymm5,ymm2,YMMWORD[64+rcx] + vpmadd52huq ymm6,ymm2,YMMWORD[96+rcx] + vpmadd52huq ymm7,ymm2,YMMWORD[128+rcx] + vpmadd52huq ymm8,ymm2,YMMWORD[160+rcx] + vpmadd52huq ymm9,ymm2,YMMWORD[192+rcx] + vpmadd52huq ymm10,ymm2,YMMWORD[224+rcx] + + vpbroadcastq ymm0,r9 + vpblendd ymm3,ymm3,ymm0,3 + + + + vpsrlq ymm0,ymm3,52 + vpsrlq ymm1,ymm4,52 + vpsrlq ymm2,ymm5,52 + vpsrlq ymm19,ymm6,52 + vpsrlq ymm20,ymm7,52 + vpsrlq ymm21,ymm8,52 + vpsrlq ymm22,ymm9,52 + vpsrlq ymm23,ymm10,52 + + + valignq ymm23,ymm23,ymm22,3 + valignq ymm22,ymm22,ymm21,3 + valignq ymm21,ymm21,ymm20,3 + valignq ymm20,ymm20,ymm19,3 + valignq ymm19,ymm19,ymm2,3 + valignq ymm2,ymm2,ymm1,3 + valignq ymm1,ymm1,ymm0,3 + valignq ymm0,ymm0,YMMWORD[$L$zeros],3 + + + vpandq ymm3,ymm3,YMMWORD[$L$mask52x4] + vpandq ymm4,ymm4,YMMWORD[$L$mask52x4] + vpandq ymm5,ymm5,YMMWORD[$L$mask52x4] + vpandq ymm6,ymm6,YMMWORD[$L$mask52x4] + vpandq ymm7,ymm7,YMMWORD[$L$mask52x4] + vpandq ymm8,ymm8,YMMWORD[$L$mask52x4] + vpandq 
ymm9,ymm9,YMMWORD[$L$mask52x4] + vpandq ymm10,ymm10,YMMWORD[$L$mask52x4] + + + vpaddq ymm3,ymm3,ymm0 + vpaddq ymm4,ymm4,ymm1 + vpaddq ymm5,ymm5,ymm2 + vpaddq ymm6,ymm6,ymm19 + vpaddq ymm7,ymm7,ymm20 + vpaddq ymm8,ymm8,ymm21 + vpaddq ymm9,ymm9,ymm22 + vpaddq ymm10,ymm10,ymm23 + + + + vpcmpuq k1,ymm3,YMMWORD[$L$mask52x4],6 + vpcmpuq k2,ymm4,YMMWORD[$L$mask52x4],6 + kmovb r14d,k1 + kmovb r13d,k2 + shl r13b,4 + or r14b,r13b + + vpcmpuq k1,ymm5,YMMWORD[$L$mask52x4],6 + vpcmpuq k2,ymm6,YMMWORD[$L$mask52x4],6 + kmovb r13d,k1 + kmovb r12d,k2 + shl r12b,4 + or r13b,r12b + + vpcmpuq k1,ymm7,YMMWORD[$L$mask52x4],6 + vpcmpuq k2,ymm8,YMMWORD[$L$mask52x4],6 + kmovb r12d,k1 + kmovb r11d,k2 + shl r11b,4 + or r12b,r11b + + vpcmpuq k1,ymm9,YMMWORD[$L$mask52x4],6 + vpcmpuq k2,ymm10,YMMWORD[$L$mask52x4],6 + kmovb r11d,k1 + kmovb r10d,k2 + shl r10b,4 + or r11b,r10b + + add r14b,r14b + adc r13b,r13b + adc r12b,r12b + adc r11b,r11b + + + vpcmpuq k1,ymm3,YMMWORD[$L$mask52x4],0 + vpcmpuq k2,ymm4,YMMWORD[$L$mask52x4],0 + kmovb r9d,k1 + kmovb r8d,k2 + shl r8b,4 + or r9b,r8b + + vpcmpuq k1,ymm5,YMMWORD[$L$mask52x4],0 + vpcmpuq k2,ymm6,YMMWORD[$L$mask52x4],0 + kmovb r8d,k1 + kmovb edx,k2 + shl dl,4 + or r8b,dl + + vpcmpuq k1,ymm7,YMMWORD[$L$mask52x4],0 + vpcmpuq k2,ymm8,YMMWORD[$L$mask52x4],0 + kmovb edx,k1 + kmovb ecx,k2 + shl cl,4 + or dl,cl + + vpcmpuq k1,ymm9,YMMWORD[$L$mask52x4],0 + vpcmpuq k2,ymm10,YMMWORD[$L$mask52x4],0 + kmovb ecx,k1 + kmovb ebx,k2 + shl bl,4 + or cl,bl + + add r14b,r9b + adc r13b,r8b + adc r12b,dl + adc r11b,cl + + xor r14b,r9b + xor r13b,r8b + xor r12b,dl + xor r11b,cl + + kmovb k1,r14d + shr r14b,4 + kmovb k2,r14d + kmovb k3,r13d + shr r13b,4 + kmovb k4,r13d + kmovb k5,r12d + shr r12b,4 + kmovb k6,r12d + kmovb k7,r11d + + vpsubq ymm3{k1},ymm3,YMMWORD[$L$mask52x4] + vpsubq ymm4{k2},ymm4,YMMWORD[$L$mask52x4] + vpsubq ymm5{k3},ymm5,YMMWORD[$L$mask52x4] + vpsubq ymm6{k4},ymm6,YMMWORD[$L$mask52x4] + vpsubq ymm7{k5},ymm7,YMMWORD[$L$mask52x4] + vpsubq ymm8{k6},ymm8,YMMWORD[$L$mask52x4] + vpsubq ymm9{k7},ymm9,YMMWORD[$L$mask52x4] + + vpandq ymm3,ymm3,YMMWORD[$L$mask52x4] + vpandq ymm4,ymm4,YMMWORD[$L$mask52x4] + vpandq ymm5,ymm5,YMMWORD[$L$mask52x4] + vpandq ymm6,ymm6,YMMWORD[$L$mask52x4] + vpandq ymm7,ymm7,YMMWORD[$L$mask52x4] + vpandq ymm8,ymm8,YMMWORD[$L$mask52x4] + vpandq ymm9,ymm9,YMMWORD[$L$mask52x4] + + shr r11b,4 + kmovb k1,r11d + + vpsubq ymm10{k1},ymm10,YMMWORD[$L$mask52x4] + + vpandq ymm10,ymm10,YMMWORD[$L$mask52x4] + + vmovdqu64 YMMWORD[rdi],ymm3 + vmovdqu64 YMMWORD[32+rdi],ymm4 + vmovdqu64 YMMWORD[64+rdi],ymm5 + vmovdqu64 YMMWORD[96+rdi],ymm6 + vmovdqu64 YMMWORD[128+rdi],ymm7 + vmovdqu64 YMMWORD[160+rdi],ymm8 + vmovdqu64 YMMWORD[192+rdi],ymm9 + vmovdqu64 YMMWORD[224+rdi],ymm10 + + vzeroupper + lea rax,[rsp] + + vmovdqa64 xmm6,XMMWORD[rax] + vmovdqa64 xmm7,XMMWORD[16+rax] + vmovdqa64 xmm8,XMMWORD[32+rax] + vmovdqa64 xmm9,XMMWORD[48+rax] + vmovdqa64 xmm10,XMMWORD[64+rax] + vmovdqa64 xmm11,XMMWORD[80+rax] + vmovdqa64 xmm12,XMMWORD[96+rax] + vmovdqa64 xmm13,XMMWORD[112+rax] + vmovdqa64 xmm14,XMMWORD[128+rax] + vmovdqa64 xmm15,XMMWORD[144+rax] + lea rax,[168+rsp] + mov r15,QWORD[rax] + + mov r14,QWORD[8+rax] + + mov r13,QWORD[16+rax] + + mov r12,QWORD[24+rax] + + mov rbp,QWORD[32+rax] + + mov rbx,QWORD[40+rax] + + lea rsp,[48+rax] + +$L$ossl_rsaz_amm52x30_x1_ifma256_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +$L$SEH_end_ossl_rsaz_amm52x30_x1_ifma256: +section .rdata rdata align=32 +ALIGN 32 +$L$mask52x4: + DQ 0xfffffffffffff + DQ 
0xfffffffffffff + DQ 0xfffffffffffff + DQ 0xfffffffffffff +section .text code align=64 + + +global ossl_rsaz_amm52x30_x2_ifma256 + +ALIGN 32 +ossl_rsaz_amm52x30_x2_ifma256: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_ossl_rsaz_amm52x30_x2_ifma256: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD[40+rsp] + + + +DB 243,15,30,250 + push rbx + + push rbp + + push r12 + + push r13 + + push r14 + + push r15 + + lea rsp,[((-168))+rsp] + vmovdqa64 XMMWORD[rsp],xmm6 + vmovdqa64 XMMWORD[16+rsp],xmm7 + vmovdqa64 XMMWORD[32+rsp],xmm8 + vmovdqa64 XMMWORD[48+rsp],xmm9 + vmovdqa64 XMMWORD[64+rsp],xmm10 + vmovdqa64 XMMWORD[80+rsp],xmm11 + vmovdqa64 XMMWORD[96+rsp],xmm12 + vmovdqa64 XMMWORD[112+rsp],xmm13 + vmovdqa64 XMMWORD[128+rsp],xmm14 + vmovdqa64 XMMWORD[144+rsp],xmm15 +$L$ossl_rsaz_amm52x30_x2_ifma256_body: + + vpxord ymm0,ymm0,ymm0 + vmovdqa64 ymm3,ymm0 + vmovdqa64 ymm4,ymm0 + vmovdqa64 ymm5,ymm0 + vmovdqa64 ymm6,ymm0 + vmovdqa64 ymm7,ymm0 + vmovdqa64 ymm8,ymm0 + vmovdqa64 ymm9,ymm0 + vmovdqa64 ymm10,ymm0 + + vmovdqa64 ymm11,ymm0 + vmovdqa64 ymm12,ymm0 + vmovdqa64 ymm13,ymm0 + vmovdqa64 ymm14,ymm0 + vmovdqa64 ymm15,ymm0 + vmovdqa64 ymm16,ymm0 + vmovdqa64 ymm17,ymm0 + vmovdqa64 ymm18,ymm0 + + + xor r9d,r9d + xor r15d,r15d + + mov r11,rdx + mov rax,0xfffffffffffff + + mov ebx,30 + +ALIGN 32 +$L$loop30: + mov r13,QWORD[r11] + + vpbroadcastq ymm1,r13 + mov rdx,QWORD[rsi] + mulx r12,r13,r13 + add r9,r13 + mov r10,r12 + adc r10,0 + + mov r13,QWORD[r8] + imul r13,r9 + and r13,rax + + vpbroadcastq ymm2,r13 + mov rdx,QWORD[rcx] + mulx r12,r13,r13 + add r9,r13 + adc r10,r12 + + shr r9,52 + sal r10,12 + or r9,r10 + + vpmadd52luq ymm3,ymm1,YMMWORD[rsi] + vpmadd52luq ymm4,ymm1,YMMWORD[32+rsi] + vpmadd52luq ymm5,ymm1,YMMWORD[64+rsi] + vpmadd52luq ymm6,ymm1,YMMWORD[96+rsi] + vpmadd52luq ymm7,ymm1,YMMWORD[128+rsi] + vpmadd52luq ymm8,ymm1,YMMWORD[160+rsi] + vpmadd52luq ymm9,ymm1,YMMWORD[192+rsi] + vpmadd52luq ymm10,ymm1,YMMWORD[224+rsi] + + vpmadd52luq ymm3,ymm2,YMMWORD[rcx] + vpmadd52luq ymm4,ymm2,YMMWORD[32+rcx] + vpmadd52luq ymm5,ymm2,YMMWORD[64+rcx] + vpmadd52luq ymm6,ymm2,YMMWORD[96+rcx] + vpmadd52luq ymm7,ymm2,YMMWORD[128+rcx] + vpmadd52luq ymm8,ymm2,YMMWORD[160+rcx] + vpmadd52luq ymm9,ymm2,YMMWORD[192+rcx] + vpmadd52luq ymm10,ymm2,YMMWORD[224+rcx] + + + valignq ymm3,ymm4,ymm3,1 + valignq ymm4,ymm5,ymm4,1 + valignq ymm5,ymm6,ymm5,1 + valignq ymm6,ymm7,ymm6,1 + valignq ymm7,ymm8,ymm7,1 + valignq ymm8,ymm9,ymm8,1 + valignq ymm9,ymm10,ymm9,1 + valignq ymm10,ymm0,ymm10,1 + + vmovq r13,xmm3 + add r9,r13 + + vpmadd52huq ymm3,ymm1,YMMWORD[rsi] + vpmadd52huq ymm4,ymm1,YMMWORD[32+rsi] + vpmadd52huq ymm5,ymm1,YMMWORD[64+rsi] + vpmadd52huq ymm6,ymm1,YMMWORD[96+rsi] + vpmadd52huq ymm7,ymm1,YMMWORD[128+rsi] + vpmadd52huq ymm8,ymm1,YMMWORD[160+rsi] + vpmadd52huq ymm9,ymm1,YMMWORD[192+rsi] + vpmadd52huq ymm10,ymm1,YMMWORD[224+rsi] + + vpmadd52huq ymm3,ymm2,YMMWORD[rcx] + vpmadd52huq ymm4,ymm2,YMMWORD[32+rcx] + vpmadd52huq ymm5,ymm2,YMMWORD[64+rcx] + vpmadd52huq ymm6,ymm2,YMMWORD[96+rcx] + vpmadd52huq ymm7,ymm2,YMMWORD[128+rcx] + vpmadd52huq ymm8,ymm2,YMMWORD[160+rcx] + vpmadd52huq ymm9,ymm2,YMMWORD[192+rcx] + vpmadd52huq ymm10,ymm2,YMMWORD[224+rcx] + mov r13,QWORD[256+r11] + + vpbroadcastq ymm1,r13 + mov rdx,QWORD[256+rsi] + mulx r12,r13,r13 + add r15,r13 + mov r10,r12 + adc r10,0 + + mov r13,QWORD[8+r8] + imul r13,r15 + and r13,rax + + vpbroadcastq ymm2,r13 + mov rdx,QWORD[256+rcx] + mulx r12,r13,r13 + add r15,r13 + adc r10,r12 + + shr r15,52 + sal r10,12 + 
or r15,r10 + + vpmadd52luq ymm11,ymm1,YMMWORD[256+rsi] + vpmadd52luq ymm12,ymm1,YMMWORD[288+rsi] + vpmadd52luq ymm13,ymm1,YMMWORD[320+rsi] + vpmadd52luq ymm14,ymm1,YMMWORD[352+rsi] + vpmadd52luq ymm15,ymm1,YMMWORD[384+rsi] + vpmadd52luq ymm16,ymm1,YMMWORD[416+rsi] + vpmadd52luq ymm17,ymm1,YMMWORD[448+rsi] + vpmadd52luq ymm18,ymm1,YMMWORD[480+rsi] + + vpmadd52luq ymm11,ymm2,YMMWORD[256+rcx] + vpmadd52luq ymm12,ymm2,YMMWORD[288+rcx] + vpmadd52luq ymm13,ymm2,YMMWORD[320+rcx] + vpmadd52luq ymm14,ymm2,YMMWORD[352+rcx] + vpmadd52luq ymm15,ymm2,YMMWORD[384+rcx] + vpmadd52luq ymm16,ymm2,YMMWORD[416+rcx] + vpmadd52luq ymm17,ymm2,YMMWORD[448+rcx] + vpmadd52luq ymm18,ymm2,YMMWORD[480+rcx] + + + valignq ymm11,ymm12,ymm11,1 + valignq ymm12,ymm13,ymm12,1 + valignq ymm13,ymm14,ymm13,1 + valignq ymm14,ymm15,ymm14,1 + valignq ymm15,ymm16,ymm15,1 + valignq ymm16,ymm17,ymm16,1 + valignq ymm17,ymm18,ymm17,1 + valignq ymm18,ymm0,ymm18,1 + + vmovq r13,xmm11 + add r15,r13 + + vpmadd52huq ymm11,ymm1,YMMWORD[256+rsi] + vpmadd52huq ymm12,ymm1,YMMWORD[288+rsi] + vpmadd52huq ymm13,ymm1,YMMWORD[320+rsi] + vpmadd52huq ymm14,ymm1,YMMWORD[352+rsi] + vpmadd52huq ymm15,ymm1,YMMWORD[384+rsi] + vpmadd52huq ymm16,ymm1,YMMWORD[416+rsi] + vpmadd52huq ymm17,ymm1,YMMWORD[448+rsi] + vpmadd52huq ymm18,ymm1,YMMWORD[480+rsi] + + vpmadd52huq ymm11,ymm2,YMMWORD[256+rcx] + vpmadd52huq ymm12,ymm2,YMMWORD[288+rcx] + vpmadd52huq ymm13,ymm2,YMMWORD[320+rcx] + vpmadd52huq ymm14,ymm2,YMMWORD[352+rcx] + vpmadd52huq ymm15,ymm2,YMMWORD[384+rcx] + vpmadd52huq ymm16,ymm2,YMMWORD[416+rcx] + vpmadd52huq ymm17,ymm2,YMMWORD[448+rcx] + vpmadd52huq ymm18,ymm2,YMMWORD[480+rcx] + lea r11,[8+r11] + dec ebx + jne NEAR $L$loop30 + + vpbroadcastq ymm0,r9 + vpblendd ymm3,ymm3,ymm0,3 + + + + vpsrlq ymm0,ymm3,52 + vpsrlq ymm1,ymm4,52 + vpsrlq ymm2,ymm5,52 + vpsrlq ymm19,ymm6,52 + vpsrlq ymm20,ymm7,52 + vpsrlq ymm21,ymm8,52 + vpsrlq ymm22,ymm9,52 + vpsrlq ymm23,ymm10,52 + + + valignq ymm23,ymm23,ymm22,3 + valignq ymm22,ymm22,ymm21,3 + valignq ymm21,ymm21,ymm20,3 + valignq ymm20,ymm20,ymm19,3 + valignq ymm19,ymm19,ymm2,3 + valignq ymm2,ymm2,ymm1,3 + valignq ymm1,ymm1,ymm0,3 + valignq ymm0,ymm0,YMMWORD[$L$zeros],3 + + + vpandq ymm3,ymm3,YMMWORD[$L$mask52x4] + vpandq ymm4,ymm4,YMMWORD[$L$mask52x4] + vpandq ymm5,ymm5,YMMWORD[$L$mask52x4] + vpandq ymm6,ymm6,YMMWORD[$L$mask52x4] + vpandq ymm7,ymm7,YMMWORD[$L$mask52x4] + vpandq ymm8,ymm8,YMMWORD[$L$mask52x4] + vpandq ymm9,ymm9,YMMWORD[$L$mask52x4] + vpandq ymm10,ymm10,YMMWORD[$L$mask52x4] + + + vpaddq ymm3,ymm3,ymm0 + vpaddq ymm4,ymm4,ymm1 + vpaddq ymm5,ymm5,ymm2 + vpaddq ymm6,ymm6,ymm19 + vpaddq ymm7,ymm7,ymm20 + vpaddq ymm8,ymm8,ymm21 + vpaddq ymm9,ymm9,ymm22 + vpaddq ymm10,ymm10,ymm23 + + + + vpcmpuq k1,ymm3,YMMWORD[$L$mask52x4],6 + vpcmpuq k2,ymm4,YMMWORD[$L$mask52x4],6 + kmovb r14d,k1 + kmovb r13d,k2 + shl r13b,4 + or r14b,r13b + + vpcmpuq k1,ymm5,YMMWORD[$L$mask52x4],6 + vpcmpuq k2,ymm6,YMMWORD[$L$mask52x4],6 + kmovb r13d,k1 + kmovb r12d,k2 + shl r12b,4 + or r13b,r12b + + vpcmpuq k1,ymm7,YMMWORD[$L$mask52x4],6 + vpcmpuq k2,ymm8,YMMWORD[$L$mask52x4],6 + kmovb r12d,k1 + kmovb r11d,k2 + shl r11b,4 + or r12b,r11b + + vpcmpuq k1,ymm9,YMMWORD[$L$mask52x4],6 + vpcmpuq k2,ymm10,YMMWORD[$L$mask52x4],6 + kmovb r11d,k1 + kmovb r10d,k2 + shl r10b,4 + or r11b,r10b + + add r14b,r14b + adc r13b,r13b + adc r12b,r12b + adc r11b,r11b + + + vpcmpuq k1,ymm3,YMMWORD[$L$mask52x4],0 + vpcmpuq k2,ymm4,YMMWORD[$L$mask52x4],0 + kmovb r9d,k1 + kmovb r8d,k2 + shl r8b,4 + or r9b,r8b + + vpcmpuq k1,ymm5,YMMWORD[$L$mask52x4],0 + vpcmpuq 
k2,ymm6,YMMWORD[$L$mask52x4],0 + kmovb r8d,k1 + kmovb edx,k2 + shl dl,4 + or r8b,dl + + vpcmpuq k1,ymm7,YMMWORD[$L$mask52x4],0 + vpcmpuq k2,ymm8,YMMWORD[$L$mask52x4],0 + kmovb edx,k1 + kmovb ecx,k2 + shl cl,4 + or dl,cl + + vpcmpuq k1,ymm9,YMMWORD[$L$mask52x4],0 + vpcmpuq k2,ymm10,YMMWORD[$L$mask52x4],0 + kmovb ecx,k1 + kmovb ebx,k2 + shl bl,4 + or cl,bl + + add r14b,r9b + adc r13b,r8b + adc r12b,dl + adc r11b,cl + + xor r14b,r9b + xor r13b,r8b + xor r12b,dl + xor r11b,cl + + kmovb k1,r14d + shr r14b,4 + kmovb k2,r14d + kmovb k3,r13d + shr r13b,4 + kmovb k4,r13d + kmovb k5,r12d + shr r12b,4 + kmovb k6,r12d + kmovb k7,r11d + + vpsubq ymm3{k1},ymm3,YMMWORD[$L$mask52x4] + vpsubq ymm4{k2},ymm4,YMMWORD[$L$mask52x4] + vpsubq ymm5{k3},ymm5,YMMWORD[$L$mask52x4] + vpsubq ymm6{k4},ymm6,YMMWORD[$L$mask52x4] + vpsubq ymm7{k5},ymm7,YMMWORD[$L$mask52x4] + vpsubq ymm8{k6},ymm8,YMMWORD[$L$mask52x4] + vpsubq ymm9{k7},ymm9,YMMWORD[$L$mask52x4] + + vpandq ymm3,ymm3,YMMWORD[$L$mask52x4] + vpandq ymm4,ymm4,YMMWORD[$L$mask52x4] + vpandq ymm5,ymm5,YMMWORD[$L$mask52x4] + vpandq ymm6,ymm6,YMMWORD[$L$mask52x4] + vpandq ymm7,ymm7,YMMWORD[$L$mask52x4] + vpandq ymm8,ymm8,YMMWORD[$L$mask52x4] + vpandq ymm9,ymm9,YMMWORD[$L$mask52x4] + + shr r11b,4 + kmovb k1,r11d + + vpsubq ymm10{k1},ymm10,YMMWORD[$L$mask52x4] + + vpandq ymm10,ymm10,YMMWORD[$L$mask52x4] + + vpbroadcastq ymm0,r15 + vpblendd ymm11,ymm11,ymm0,3 + + + + vpsrlq ymm0,ymm11,52 + vpsrlq ymm1,ymm12,52 + vpsrlq ymm2,ymm13,52 + vpsrlq ymm19,ymm14,52 + vpsrlq ymm20,ymm15,52 + vpsrlq ymm21,ymm16,52 + vpsrlq ymm22,ymm17,52 + vpsrlq ymm23,ymm18,52 + + + valignq ymm23,ymm23,ymm22,3 + valignq ymm22,ymm22,ymm21,3 + valignq ymm21,ymm21,ymm20,3 + valignq ymm20,ymm20,ymm19,3 + valignq ymm19,ymm19,ymm2,3 + valignq ymm2,ymm2,ymm1,3 + valignq ymm1,ymm1,ymm0,3 + valignq ymm0,ymm0,YMMWORD[$L$zeros],3 + + + vpandq ymm11,ymm11,YMMWORD[$L$mask52x4] + vpandq ymm12,ymm12,YMMWORD[$L$mask52x4] + vpandq ymm13,ymm13,YMMWORD[$L$mask52x4] + vpandq ymm14,ymm14,YMMWORD[$L$mask52x4] + vpandq ymm15,ymm15,YMMWORD[$L$mask52x4] + vpandq ymm16,ymm16,YMMWORD[$L$mask52x4] + vpandq ymm17,ymm17,YMMWORD[$L$mask52x4] + vpandq ymm18,ymm18,YMMWORD[$L$mask52x4] + + + vpaddq ymm11,ymm11,ymm0 + vpaddq ymm12,ymm12,ymm1 + vpaddq ymm13,ymm13,ymm2 + vpaddq ymm14,ymm14,ymm19 + vpaddq ymm15,ymm15,ymm20 + vpaddq ymm16,ymm16,ymm21 + vpaddq ymm17,ymm17,ymm22 + vpaddq ymm18,ymm18,ymm23 + + + + vpcmpuq k1,ymm11,YMMWORD[$L$mask52x4],6 + vpcmpuq k2,ymm12,YMMWORD[$L$mask52x4],6 + kmovb r14d,k1 + kmovb r13d,k2 + shl r13b,4 + or r14b,r13b + + vpcmpuq k1,ymm13,YMMWORD[$L$mask52x4],6 + vpcmpuq k2,ymm14,YMMWORD[$L$mask52x4],6 + kmovb r13d,k1 + kmovb r12d,k2 + shl r12b,4 + or r13b,r12b + + vpcmpuq k1,ymm15,YMMWORD[$L$mask52x4],6 + vpcmpuq k2,ymm16,YMMWORD[$L$mask52x4],6 + kmovb r12d,k1 + kmovb r11d,k2 + shl r11b,4 + or r12b,r11b + + vpcmpuq k1,ymm17,YMMWORD[$L$mask52x4],6 + vpcmpuq k2,ymm18,YMMWORD[$L$mask52x4],6 + kmovb r11d,k1 + kmovb r10d,k2 + shl r10b,4 + or r11b,r10b + + add r14b,r14b + adc r13b,r13b + adc r12b,r12b + adc r11b,r11b + + + vpcmpuq k1,ymm11,YMMWORD[$L$mask52x4],0 + vpcmpuq k2,ymm12,YMMWORD[$L$mask52x4],0 + kmovb r9d,k1 + kmovb r8d,k2 + shl r8b,4 + or r9b,r8b + + vpcmpuq k1,ymm13,YMMWORD[$L$mask52x4],0 + vpcmpuq k2,ymm14,YMMWORD[$L$mask52x4],0 + kmovb r8d,k1 + kmovb edx,k2 + shl dl,4 + or r8b,dl + + vpcmpuq k1,ymm15,YMMWORD[$L$mask52x4],0 + vpcmpuq k2,ymm16,YMMWORD[$L$mask52x4],0 + kmovb edx,k1 + kmovb ecx,k2 + shl cl,4 + or dl,cl + + vpcmpuq k1,ymm17,YMMWORD[$L$mask52x4],0 + vpcmpuq 
k2,ymm18,YMMWORD[$L$mask52x4],0 + kmovb ecx,k1 + kmovb ebx,k2 + shl bl,4 + or cl,bl + + add r14b,r9b + adc r13b,r8b + adc r12b,dl + adc r11b,cl + + xor r14b,r9b + xor r13b,r8b + xor r12b,dl + xor r11b,cl + + kmovb k1,r14d + shr r14b,4 + kmovb k2,r14d + kmovb k3,r13d + shr r13b,4 + kmovb k4,r13d + kmovb k5,r12d + shr r12b,4 + kmovb k6,r12d + kmovb k7,r11d + + vpsubq ymm11{k1},ymm11,YMMWORD[$L$mask52x4] + vpsubq ymm12{k2},ymm12,YMMWORD[$L$mask52x4] + vpsubq ymm13{k3},ymm13,YMMWORD[$L$mask52x4] + vpsubq ymm14{k4},ymm14,YMMWORD[$L$mask52x4] + vpsubq ymm15{k5},ymm15,YMMWORD[$L$mask52x4] + vpsubq ymm16{k6},ymm16,YMMWORD[$L$mask52x4] + vpsubq ymm17{k7},ymm17,YMMWORD[$L$mask52x4] + + vpandq ymm11,ymm11,YMMWORD[$L$mask52x4] + vpandq ymm12,ymm12,YMMWORD[$L$mask52x4] + vpandq ymm13,ymm13,YMMWORD[$L$mask52x4] + vpandq ymm14,ymm14,YMMWORD[$L$mask52x4] + vpandq ymm15,ymm15,YMMWORD[$L$mask52x4] + vpandq ymm16,ymm16,YMMWORD[$L$mask52x4] + vpandq ymm17,ymm17,YMMWORD[$L$mask52x4] + + shr r11b,4 + kmovb k1,r11d + + vpsubq ymm18{k1},ymm18,YMMWORD[$L$mask52x4] + + vpandq ymm18,ymm18,YMMWORD[$L$mask52x4] + + vmovdqu64 YMMWORD[rdi],ymm3 + vmovdqu64 YMMWORD[32+rdi],ymm4 + vmovdqu64 YMMWORD[64+rdi],ymm5 + vmovdqu64 YMMWORD[96+rdi],ymm6 + vmovdqu64 YMMWORD[128+rdi],ymm7 + vmovdqu64 YMMWORD[160+rdi],ymm8 + vmovdqu64 YMMWORD[192+rdi],ymm9 + vmovdqu64 YMMWORD[224+rdi],ymm10 + + vmovdqu64 YMMWORD[256+rdi],ymm11 + vmovdqu64 YMMWORD[288+rdi],ymm12 + vmovdqu64 YMMWORD[320+rdi],ymm13 + vmovdqu64 YMMWORD[352+rdi],ymm14 + vmovdqu64 YMMWORD[384+rdi],ymm15 + vmovdqu64 YMMWORD[416+rdi],ymm16 + vmovdqu64 YMMWORD[448+rdi],ymm17 + vmovdqu64 YMMWORD[480+rdi],ymm18 + + vzeroupper + lea rax,[rsp] + + vmovdqa64 xmm6,XMMWORD[rax] + vmovdqa64 xmm7,XMMWORD[16+rax] + vmovdqa64 xmm8,XMMWORD[32+rax] + vmovdqa64 xmm9,XMMWORD[48+rax] + vmovdqa64 xmm10,XMMWORD[64+rax] + vmovdqa64 xmm11,XMMWORD[80+rax] + vmovdqa64 xmm12,XMMWORD[96+rax] + vmovdqa64 xmm13,XMMWORD[112+rax] + vmovdqa64 xmm14,XMMWORD[128+rax] + vmovdqa64 xmm15,XMMWORD[144+rax] + lea rax,[168+rsp] + mov r15,QWORD[rax] + + mov r14,QWORD[8+rax] + + mov r13,QWORD[16+rax] + + mov r12,QWORD[24+rax] + + mov rbp,QWORD[32+rax] + + mov rbx,QWORD[40+rax] + + lea rsp,[48+rax] + +$L$ossl_rsaz_amm52x30_x2_ifma256_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +$L$SEH_end_ossl_rsaz_amm52x30_x2_ifma256: +section .text code align=64 + + +ALIGN 32 +global ossl_extract_multiplier_2x30_win5 + +ossl_extract_multiplier_2x30_win5: + +DB 243,15,30,250 + vmovdqa64 ymm30,YMMWORD[$L$ones] + vpbroadcastq ymm28,r8 + vpbroadcastq ymm29,r9 + lea rax,[16384+rdx] + + + vpxor xmm0,xmm0,xmm0 + vmovdqa64 ymm27,ymm0 + vmovdqa64 ymm1,ymm0 + vmovdqa64 ymm2,ymm0 + vmovdqa64 ymm3,ymm0 + vmovdqa64 ymm4,ymm0 + vmovdqa64 ymm5,ymm0 + vmovdqa64 ymm16,ymm0 + vmovdqa64 ymm17,ymm0 + vmovdqa64 ymm18,ymm0 + vmovdqa64 ymm19,ymm0 + vmovdqa64 ymm20,ymm0 + vmovdqa64 ymm21,ymm0 + vmovdqa64 ymm22,ymm0 + vmovdqa64 ymm23,ymm0 + vmovdqa64 ymm24,ymm0 + vmovdqa64 ymm25,ymm0 + +ALIGN 32 +$L$loop: + vpcmpq k1,ymm28,ymm27,0 + vpcmpq k2,ymm29,ymm27,0 + vmovdqu64 ymm26,YMMWORD[rdx] + vpblendmq ymm0{k1},ymm0,ymm26 + vmovdqu64 ymm26,YMMWORD[32+rdx] + vpblendmq ymm1{k1},ymm1,ymm26 + vmovdqu64 ymm26,YMMWORD[64+rdx] + vpblendmq ymm2{k1},ymm2,ymm26 + vmovdqu64 ymm26,YMMWORD[96+rdx] + vpblendmq ymm3{k1},ymm3,ymm26 + vmovdqu64 ymm26,YMMWORD[128+rdx] + vpblendmq ymm4{k1},ymm4,ymm26 + vmovdqu64 ymm26,YMMWORD[160+rdx] + vpblendmq ymm5{k1},ymm5,ymm26 + vmovdqu64 ymm26,YMMWORD[192+rdx] + vpblendmq 
ymm16{k1},ymm16,ymm26 + vmovdqu64 ymm26,YMMWORD[224+rdx] + vpblendmq ymm17{k1},ymm17,ymm26 + vmovdqu64 ymm26,YMMWORD[256+rdx] + vpblendmq ymm18{k2},ymm18,ymm26 + vmovdqu64 ymm26,YMMWORD[288+rdx] + vpblendmq ymm19{k2},ymm19,ymm26 + vmovdqu64 ymm26,YMMWORD[320+rdx] + vpblendmq ymm20{k2},ymm20,ymm26 + vmovdqu64 ymm26,YMMWORD[352+rdx] + vpblendmq ymm21{k2},ymm21,ymm26 + vmovdqu64 ymm26,YMMWORD[384+rdx] + vpblendmq ymm22{k2},ymm22,ymm26 + vmovdqu64 ymm26,YMMWORD[416+rdx] + vpblendmq ymm23{k2},ymm23,ymm26 + vmovdqu64 ymm26,YMMWORD[448+rdx] + vpblendmq ymm24{k2},ymm24,ymm26 + vmovdqu64 ymm26,YMMWORD[480+rdx] + vpblendmq ymm25{k2},ymm25,ymm26 + vpaddq ymm27,ymm27,ymm30 + add rdx,512 + cmp rax,rdx + jne NEAR $L$loop + vmovdqu64 YMMWORD[rcx],ymm0 + vmovdqu64 YMMWORD[32+rcx],ymm1 + vmovdqu64 YMMWORD[64+rcx],ymm2 + vmovdqu64 YMMWORD[96+rcx],ymm3 + vmovdqu64 YMMWORD[128+rcx],ymm4 + vmovdqu64 YMMWORD[160+rcx],ymm5 + vmovdqu64 YMMWORD[192+rcx],ymm16 + vmovdqu64 YMMWORD[224+rcx],ymm17 + vmovdqu64 YMMWORD[256+rcx],ymm18 + vmovdqu64 YMMWORD[288+rcx],ymm19 + vmovdqu64 YMMWORD[320+rcx],ymm20 + vmovdqu64 YMMWORD[352+rcx],ymm21 + vmovdqu64 YMMWORD[384+rcx],ymm22 + vmovdqu64 YMMWORD[416+rcx],ymm23 + vmovdqu64 YMMWORD[448+rcx],ymm24 + vmovdqu64 YMMWORD[480+rcx],ymm25 + + DB 0F3h,0C3h ;repret + + +section .rdata rdata align=32 +ALIGN 32 +$L$ones: + DQ 1,1,1,1 +$L$zeros: + DQ 0,0,0,0 +EXTERN __imp_RtlVirtualUnwind + +ALIGN 16 +rsaz_avx_handler: + push rsi + push rdi + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + pushfq + sub rsp,64 + + mov rax,QWORD[120+r8] + mov rbx,QWORD[248+r8] + + mov rsi,QWORD[8+r9] + mov r11,QWORD[56+r9] + + mov r10d,DWORD[r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jb NEAR $L$common_seh_tail + + mov r10d,DWORD[4+r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jae NEAR $L$common_seh_tail + + mov rax,QWORD[152+r8] + + lea rsi,[rax] + lea rdi,[512+r8] + mov ecx,20 + DD 0xa548f3fc + + lea rax,[216+rax] + + mov rbx,QWORD[((-8))+rax] + mov rbp,QWORD[((-16))+rax] + mov r12,QWORD[((-24))+rax] + mov r13,QWORD[((-32))+rax] + mov r14,QWORD[((-40))+rax] + mov r15,QWORD[((-48))+rax] + mov QWORD[144+r8],rbx + mov QWORD[160+r8],rbp + mov QWORD[216+r8],r12 + mov QWORD[224+r8],r13 + mov QWORD[232+r8],r14 + mov QWORD[240+r8],r15 + +$L$common_seh_tail: + mov rdi,QWORD[8+rax] + mov rsi,QWORD[16+rax] + mov QWORD[152+r8],rax + mov QWORD[168+r8],rsi + mov QWORD[176+r8],rdi + + mov rdi,QWORD[40+r9] + mov rsi,r8 + mov ecx,154 + DD 0xa548f3fc + + mov rsi,r9 + xor rcx,rcx + mov rdx,QWORD[8+rsi] + mov r8,QWORD[rsi] + mov r9,QWORD[16+rsi] + mov r10,QWORD[40+rsi] + lea r11,[56+rsi] + lea r12,[24+rsi] + mov QWORD[32+rsp],r10 + mov QWORD[40+rsp],r11 + mov QWORD[48+rsp],r12 + mov QWORD[56+rsp],rcx + call QWORD[__imp_RtlVirtualUnwind] + + mov eax,1 + add rsp,64 + popfq + pop r15 + pop r14 + pop r13 + pop r12 + pop rbp + pop rbx + pop rdi + pop rsi + DB 0F3h,0C3h ;repret + + +section .pdata rdata align=4 +ALIGN 4 + DD $L$SEH_begin_ossl_rsaz_amm52x30_x1_ifma256 wrt ..imagebase + DD $L$SEH_end_ossl_rsaz_amm52x30_x1_ifma256 wrt ..imagebase + DD $L$SEH_info_ossl_rsaz_amm52x30_x1_ifma256 wrt ..imagebase + + DD $L$SEH_begin_ossl_rsaz_amm52x30_x2_ifma256 wrt ..imagebase + DD $L$SEH_end_ossl_rsaz_amm52x30_x2_ifma256 wrt ..imagebase + DD $L$SEH_info_ossl_rsaz_amm52x30_x2_ifma256 wrt ..imagebase + +section .xdata rdata align=8 +ALIGN 8 +$L$SEH_info_ossl_rsaz_amm52x30_x1_ifma256: +DB 9,0,0,0 + DD rsaz_avx_handler wrt ..imagebase + DD $L$ossl_rsaz_amm52x30_x1_ifma256_body wrt 
..imagebase,$L$ossl_rsaz_amm52x30_x1_ifma256_epilogue wrt ..imagebase +$L$SEH_info_ossl_rsaz_amm52x30_x2_ifma256: +DB 9,0,0,0 + DD rsaz_avx_handler wrt ..imagebase + DD $L$ossl_rsaz_amm52x30_x2_ifma256_body wrt ..imagebase,$L$ossl_rsaz_amm52x30_x2_ifma256_epilogue wrt ..imagebase diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/bn/rsaz-4k-avx512.nasm b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/bn/rsaz-4k-avx512.nasm new file mode 100644 index 000000000000..2912374a772c --- /dev/null +++ b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/bn/rsaz-4k-avx512.nasm @@ -0,0 +1,1533 @@ +default rel +%define XMMWORD +%define YMMWORD +%define ZMMWORD +section .text code align=64 + + +global ossl_rsaz_amm52x40_x1_ifma256 + +ALIGN 32 +ossl_rsaz_amm52x40_x1_ifma256: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_ossl_rsaz_amm52x40_x1_ifma256: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD[40+rsp] + + + +DB 243,15,30,250 + push rbx + + push rbp + + push r12 + + push r13 + + push r14 + + push r15 + + lea rsp,[((-168))+rsp] + vmovdqa64 XMMWORD[rsp],xmm6 + vmovdqa64 XMMWORD[16+rsp],xmm7 + vmovdqa64 XMMWORD[32+rsp],xmm8 + vmovdqa64 XMMWORD[48+rsp],xmm9 + vmovdqa64 XMMWORD[64+rsp],xmm10 + vmovdqa64 XMMWORD[80+rsp],xmm11 + vmovdqa64 XMMWORD[96+rsp],xmm12 + vmovdqa64 XMMWORD[112+rsp],xmm13 + vmovdqa64 XMMWORD[128+rsp],xmm14 + vmovdqa64 XMMWORD[144+rsp],xmm15 +$L$ossl_rsaz_amm52x40_x1_ifma256_body: + + vpxord ymm0,ymm0,ymm0 + vmovdqa64 ymm3,ymm0 + vmovdqa64 ymm4,ymm0 + vmovdqa64 ymm5,ymm0 + vmovdqa64 ymm6,ymm0 + vmovdqa64 ymm7,ymm0 + vmovdqa64 ymm8,ymm0 + vmovdqa64 ymm9,ymm0 + vmovdqa64 ymm10,ymm0 + vmovdqa64 ymm11,ymm0 + vmovdqa64 ymm12,ymm0 + + xor r9d,r9d + + mov r11,rdx + mov rax,0xfffffffffffff + + + mov ebx,10 + +ALIGN 32 +$L$loop10: + mov r13,QWORD[r11] + + vpbroadcastq ymm1,r13 + mov rdx,QWORD[rsi] + mulx r12,r13,r13 + add r9,r13 + mov r10,r12 + adc r10,0 + + mov r13,r8 + imul r13,r9 + and r13,rax + + vpbroadcastq ymm2,r13 + mov rdx,QWORD[rcx] + mulx r12,r13,r13 + add r9,r13 + adc r10,r12 + + shr r9,52 + sal r10,12 + or r9,r10 + + vpmadd52luq ymm3,ymm1,YMMWORD[rsi] + vpmadd52luq ymm4,ymm1,YMMWORD[32+rsi] + vpmadd52luq ymm5,ymm1,YMMWORD[64+rsi] + vpmadd52luq ymm6,ymm1,YMMWORD[96+rsi] + vpmadd52luq ymm7,ymm1,YMMWORD[128+rsi] + vpmadd52luq ymm8,ymm1,YMMWORD[160+rsi] + vpmadd52luq ymm9,ymm1,YMMWORD[192+rsi] + vpmadd52luq ymm10,ymm1,YMMWORD[224+rsi] + vpmadd52luq ymm11,ymm1,YMMWORD[256+rsi] + vpmadd52luq ymm12,ymm1,YMMWORD[288+rsi] + + vpmadd52luq ymm3,ymm2,YMMWORD[rcx] + vpmadd52luq ymm4,ymm2,YMMWORD[32+rcx] + vpmadd52luq ymm5,ymm2,YMMWORD[64+rcx] + vpmadd52luq ymm6,ymm2,YMMWORD[96+rcx] + vpmadd52luq ymm7,ymm2,YMMWORD[128+rcx] + vpmadd52luq ymm8,ymm2,YMMWORD[160+rcx] + vpmadd52luq ymm9,ymm2,YMMWORD[192+rcx] + vpmadd52luq ymm10,ymm2,YMMWORD[224+rcx] + vpmadd52luq ymm11,ymm2,YMMWORD[256+rcx] + vpmadd52luq ymm12,ymm2,YMMWORD[288+rcx] + + + valignq ymm3,ymm4,ymm3,1 + valignq ymm4,ymm5,ymm4,1 + valignq ymm5,ymm6,ymm5,1 + valignq ymm6,ymm7,ymm6,1 + valignq ymm7,ymm8,ymm7,1 + valignq ymm8,ymm9,ymm8,1 + valignq ymm9,ymm10,ymm9,1 + valignq ymm10,ymm11,ymm10,1 + valignq ymm11,ymm12,ymm11,1 + valignq ymm12,ymm0,ymm12,1 + + vmovq r13,xmm3 + add r9,r13 + + vpmadd52huq ymm3,ymm1,YMMWORD[rsi] + vpmadd52huq ymm4,ymm1,YMMWORD[32+rsi] + vpmadd52huq ymm5,ymm1,YMMWORD[64+rsi] + vpmadd52huq ymm6,ymm1,YMMWORD[96+rsi] + vpmadd52huq ymm7,ymm1,YMMWORD[128+rsi] + vpmadd52huq ymm8,ymm1,YMMWORD[160+rsi] + vpmadd52huq 
ymm9,ymm1,YMMWORD[192+rsi] + vpmadd52huq ymm10,ymm1,YMMWORD[224+rsi] + vpmadd52huq ymm11,ymm1,YMMWORD[256+rsi] + vpmadd52huq ymm12,ymm1,YMMWORD[288+rsi] + + vpmadd52huq ymm3,ymm2,YMMWORD[rcx] + vpmadd52huq ymm4,ymm2,YMMWORD[32+rcx] + vpmadd52huq ymm5,ymm2,YMMWORD[64+rcx] + vpmadd52huq ymm6,ymm2,YMMWORD[96+rcx] + vpmadd52huq ymm7,ymm2,YMMWORD[128+rcx] + vpmadd52huq ymm8,ymm2,YMMWORD[160+rcx] + vpmadd52huq ymm9,ymm2,YMMWORD[192+rcx] + vpmadd52huq ymm10,ymm2,YMMWORD[224+rcx] + vpmadd52huq ymm11,ymm2,YMMWORD[256+rcx] + vpmadd52huq ymm12,ymm2,YMMWORD[288+rcx] + mov r13,QWORD[8+r11] + + vpbroadcastq ymm1,r13 + mov rdx,QWORD[rsi] + mulx r12,r13,r13 + add r9,r13 + mov r10,r12 + adc r10,0 + + mov r13,r8 + imul r13,r9 + and r13,rax + + vpbroadcastq ymm2,r13 + mov rdx,QWORD[rcx] + mulx r12,r13,r13 + add r9,r13 + adc r10,r12 + + shr r9,52 + sal r10,12 + or r9,r10 + + vpmadd52luq ymm3,ymm1,YMMWORD[rsi] + vpmadd52luq ymm4,ymm1,YMMWORD[32+rsi] + vpmadd52luq ymm5,ymm1,YMMWORD[64+rsi] + vpmadd52luq ymm6,ymm1,YMMWORD[96+rsi] + vpmadd52luq ymm7,ymm1,YMMWORD[128+rsi] + vpmadd52luq ymm8,ymm1,YMMWORD[160+rsi] + vpmadd52luq ymm9,ymm1,YMMWORD[192+rsi] + vpmadd52luq ymm10,ymm1,YMMWORD[224+rsi] + vpmadd52luq ymm11,ymm1,YMMWORD[256+rsi] + vpmadd52luq ymm12,ymm1,YMMWORD[288+rsi] + + vpmadd52luq ymm3,ymm2,YMMWORD[rcx] + vpmadd52luq ymm4,ymm2,YMMWORD[32+rcx] + vpmadd52luq ymm5,ymm2,YMMWORD[64+rcx] + vpmadd52luq ymm6,ymm2,YMMWORD[96+rcx] + vpmadd52luq ymm7,ymm2,YMMWORD[128+rcx] + vpmadd52luq ymm8,ymm2,YMMWORD[160+rcx] + vpmadd52luq ymm9,ymm2,YMMWORD[192+rcx] + vpmadd52luq ymm10,ymm2,YMMWORD[224+rcx] + vpmadd52luq ymm11,ymm2,YMMWORD[256+rcx] + vpmadd52luq ymm12,ymm2,YMMWORD[288+rcx] + + + valignq ymm3,ymm4,ymm3,1 + valignq ymm4,ymm5,ymm4,1 + valignq ymm5,ymm6,ymm5,1 + valignq ymm6,ymm7,ymm6,1 + valignq ymm7,ymm8,ymm7,1 + valignq ymm8,ymm9,ymm8,1 + valignq ymm9,ymm10,ymm9,1 + valignq ymm10,ymm11,ymm10,1 + valignq ymm11,ymm12,ymm11,1 + valignq ymm12,ymm0,ymm12,1 + + vmovq r13,xmm3 + add r9,r13 + + vpmadd52huq ymm3,ymm1,YMMWORD[rsi] + vpmadd52huq ymm4,ymm1,YMMWORD[32+rsi] + vpmadd52huq ymm5,ymm1,YMMWORD[64+rsi] + vpmadd52huq ymm6,ymm1,YMMWORD[96+rsi] + vpmadd52huq ymm7,ymm1,YMMWORD[128+rsi] + vpmadd52huq ymm8,ymm1,YMMWORD[160+rsi] + vpmadd52huq ymm9,ymm1,YMMWORD[192+rsi] + vpmadd52huq ymm10,ymm1,YMMWORD[224+rsi] + vpmadd52huq ymm11,ymm1,YMMWORD[256+rsi] + vpmadd52huq ymm12,ymm1,YMMWORD[288+rsi] + + vpmadd52huq ymm3,ymm2,YMMWORD[rcx] + vpmadd52huq ymm4,ymm2,YMMWORD[32+rcx] + vpmadd52huq ymm5,ymm2,YMMWORD[64+rcx] + vpmadd52huq ymm6,ymm2,YMMWORD[96+rcx] + vpmadd52huq ymm7,ymm2,YMMWORD[128+rcx] + vpmadd52huq ymm8,ymm2,YMMWORD[160+rcx] + vpmadd52huq ymm9,ymm2,YMMWORD[192+rcx] + vpmadd52huq ymm10,ymm2,YMMWORD[224+rcx] + vpmadd52huq ymm11,ymm2,YMMWORD[256+rcx] + vpmadd52huq ymm12,ymm2,YMMWORD[288+rcx] + mov r13,QWORD[16+r11] + + vpbroadcastq ymm1,r13 + mov rdx,QWORD[rsi] + mulx r12,r13,r13 + add r9,r13 + mov r10,r12 + adc r10,0 + + mov r13,r8 + imul r13,r9 + and r13,rax + + vpbroadcastq ymm2,r13 + mov rdx,QWORD[rcx] + mulx r12,r13,r13 + add r9,r13 + adc r10,r12 + + shr r9,52 + sal r10,12 + or r9,r10 + + vpmadd52luq ymm3,ymm1,YMMWORD[rsi] + vpmadd52luq ymm4,ymm1,YMMWORD[32+rsi] + vpmadd52luq ymm5,ymm1,YMMWORD[64+rsi] + vpmadd52luq ymm6,ymm1,YMMWORD[96+rsi] + vpmadd52luq ymm7,ymm1,YMMWORD[128+rsi] + vpmadd52luq ymm8,ymm1,YMMWORD[160+rsi] + vpmadd52luq ymm9,ymm1,YMMWORD[192+rsi] + vpmadd52luq ymm10,ymm1,YMMWORD[224+rsi] + vpmadd52luq ymm11,ymm1,YMMWORD[256+rsi] + vpmadd52luq ymm12,ymm1,YMMWORD[288+rsi] + + vpmadd52luq 
ymm3,ymm2,YMMWORD[rcx] + vpmadd52luq ymm4,ymm2,YMMWORD[32+rcx] + vpmadd52luq ymm5,ymm2,YMMWORD[64+rcx] + vpmadd52luq ymm6,ymm2,YMMWORD[96+rcx] + vpmadd52luq ymm7,ymm2,YMMWORD[128+rcx] + vpmadd52luq ymm8,ymm2,YMMWORD[160+rcx] + vpmadd52luq ymm9,ymm2,YMMWORD[192+rcx] + vpmadd52luq ymm10,ymm2,YMMWORD[224+rcx] + vpmadd52luq ymm11,ymm2,YMMWORD[256+rcx] + vpmadd52luq ymm12,ymm2,YMMWORD[288+rcx] + + + valignq ymm3,ymm4,ymm3,1 + valignq ymm4,ymm5,ymm4,1 + valignq ymm5,ymm6,ymm5,1 + valignq ymm6,ymm7,ymm6,1 + valignq ymm7,ymm8,ymm7,1 + valignq ymm8,ymm9,ymm8,1 + valignq ymm9,ymm10,ymm9,1 + valignq ymm10,ymm11,ymm10,1 + valignq ymm11,ymm12,ymm11,1 + valignq ymm12,ymm0,ymm12,1 + + vmovq r13,xmm3 + add r9,r13 + + vpmadd52huq ymm3,ymm1,YMMWORD[rsi] + vpmadd52huq ymm4,ymm1,YMMWORD[32+rsi] + vpmadd52huq ymm5,ymm1,YMMWORD[64+rsi] + vpmadd52huq ymm6,ymm1,YMMWORD[96+rsi] + vpmadd52huq ymm7,ymm1,YMMWORD[128+rsi] + vpmadd52huq ymm8,ymm1,YMMWORD[160+rsi] + vpmadd52huq ymm9,ymm1,YMMWORD[192+rsi] + vpmadd52huq ymm10,ymm1,YMMWORD[224+rsi] + vpmadd52huq ymm11,ymm1,YMMWORD[256+rsi] + vpmadd52huq ymm12,ymm1,YMMWORD[288+rsi] + + vpmadd52huq ymm3,ymm2,YMMWORD[rcx] + vpmadd52huq ymm4,ymm2,YMMWORD[32+rcx] + vpmadd52huq ymm5,ymm2,YMMWORD[64+rcx] + vpmadd52huq ymm6,ymm2,YMMWORD[96+rcx] + vpmadd52huq ymm7,ymm2,YMMWORD[128+rcx] + vpmadd52huq ymm8,ymm2,YMMWORD[160+rcx] + vpmadd52huq ymm9,ymm2,YMMWORD[192+rcx] + vpmadd52huq ymm10,ymm2,YMMWORD[224+rcx] + vpmadd52huq ymm11,ymm2,YMMWORD[256+rcx] + vpmadd52huq ymm12,ymm2,YMMWORD[288+rcx] + mov r13,QWORD[24+r11] + + vpbroadcastq ymm1,r13 + mov rdx,QWORD[rsi] + mulx r12,r13,r13 + add r9,r13 + mov r10,r12 + adc r10,0 + + mov r13,r8 + imul r13,r9 + and r13,rax + + vpbroadcastq ymm2,r13 + mov rdx,QWORD[rcx] + mulx r12,r13,r13 + add r9,r13 + adc r10,r12 + + shr r9,52 + sal r10,12 + or r9,r10 + + vpmadd52luq ymm3,ymm1,YMMWORD[rsi] + vpmadd52luq ymm4,ymm1,YMMWORD[32+rsi] + vpmadd52luq ymm5,ymm1,YMMWORD[64+rsi] + vpmadd52luq ymm6,ymm1,YMMWORD[96+rsi] + vpmadd52luq ymm7,ymm1,YMMWORD[128+rsi] + vpmadd52luq ymm8,ymm1,YMMWORD[160+rsi] + vpmadd52luq ymm9,ymm1,YMMWORD[192+rsi] + vpmadd52luq ymm10,ymm1,YMMWORD[224+rsi] + vpmadd52luq ymm11,ymm1,YMMWORD[256+rsi] + vpmadd52luq ymm12,ymm1,YMMWORD[288+rsi] + + vpmadd52luq ymm3,ymm2,YMMWORD[rcx] + vpmadd52luq ymm4,ymm2,YMMWORD[32+rcx] + vpmadd52luq ymm5,ymm2,YMMWORD[64+rcx] + vpmadd52luq ymm6,ymm2,YMMWORD[96+rcx] + vpmadd52luq ymm7,ymm2,YMMWORD[128+rcx] + vpmadd52luq ymm8,ymm2,YMMWORD[160+rcx] + vpmadd52luq ymm9,ymm2,YMMWORD[192+rcx] + vpmadd52luq ymm10,ymm2,YMMWORD[224+rcx] + vpmadd52luq ymm11,ymm2,YMMWORD[256+rcx] + vpmadd52luq ymm12,ymm2,YMMWORD[288+rcx] + + + valignq ymm3,ymm4,ymm3,1 + valignq ymm4,ymm5,ymm4,1 + valignq ymm5,ymm6,ymm5,1 + valignq ymm6,ymm7,ymm6,1 + valignq ymm7,ymm8,ymm7,1 + valignq ymm8,ymm9,ymm8,1 + valignq ymm9,ymm10,ymm9,1 + valignq ymm10,ymm11,ymm10,1 + valignq ymm11,ymm12,ymm11,1 + valignq ymm12,ymm0,ymm12,1 + + vmovq r13,xmm3 + add r9,r13 + + vpmadd52huq ymm3,ymm1,YMMWORD[rsi] + vpmadd52huq ymm4,ymm1,YMMWORD[32+rsi] + vpmadd52huq ymm5,ymm1,YMMWORD[64+rsi] + vpmadd52huq ymm6,ymm1,YMMWORD[96+rsi] + vpmadd52huq ymm7,ymm1,YMMWORD[128+rsi] + vpmadd52huq ymm8,ymm1,YMMWORD[160+rsi] + vpmadd52huq ymm9,ymm1,YMMWORD[192+rsi] + vpmadd52huq ymm10,ymm1,YMMWORD[224+rsi] + vpmadd52huq ymm11,ymm1,YMMWORD[256+rsi] + vpmadd52huq ymm12,ymm1,YMMWORD[288+rsi] + + vpmadd52huq ymm3,ymm2,YMMWORD[rcx] + vpmadd52huq ymm4,ymm2,YMMWORD[32+rcx] + vpmadd52huq ymm5,ymm2,YMMWORD[64+rcx] + vpmadd52huq ymm6,ymm2,YMMWORD[96+rcx] + vpmadd52huq 
ymm7,ymm2,YMMWORD[128+rcx] + vpmadd52huq ymm8,ymm2,YMMWORD[160+rcx] + vpmadd52huq ymm9,ymm2,YMMWORD[192+rcx] + vpmadd52huq ymm10,ymm2,YMMWORD[224+rcx] + vpmadd52huq ymm11,ymm2,YMMWORD[256+rcx] + vpmadd52huq ymm12,ymm2,YMMWORD[288+rcx] + lea r11,[32+r11] + dec ebx + jne NEAR $L$loop10 + + vpbroadcastq ymm0,r9 + vpblendd ymm3,ymm3,ymm0,3 + + + + vpsrlq ymm0,ymm3,52 + vpsrlq ymm1,ymm4,52 + vpsrlq ymm2,ymm5,52 + vpsrlq ymm23,ymm6,52 + vpsrlq ymm24,ymm7,52 + vpsrlq ymm25,ymm8,52 + vpsrlq ymm26,ymm9,52 + vpsrlq ymm27,ymm10,52 + vpsrlq ymm28,ymm11,52 + vpsrlq ymm29,ymm12,52 + + + valignq ymm29,ymm29,ymm28,3 + valignq ymm28,ymm28,ymm27,3 + valignq ymm27,ymm27,ymm26,3 + valignq ymm26,ymm26,ymm25,3 + valignq ymm25,ymm25,ymm24,3 + valignq ymm24,ymm24,ymm23,3 + valignq ymm23,ymm23,ymm2,3 + valignq ymm2,ymm2,ymm1,3 + valignq ymm1,ymm1,ymm0,3 + valignq ymm0,ymm0,YMMWORD[$L$zeros],3 + + + vpandq ymm3,ymm3,YMMWORD[$L$mask52x4] + vpandq ymm4,ymm4,YMMWORD[$L$mask52x4] + vpandq ymm5,ymm5,YMMWORD[$L$mask52x4] + vpandq ymm6,ymm6,YMMWORD[$L$mask52x4] + vpandq ymm7,ymm7,YMMWORD[$L$mask52x4] + vpandq ymm8,ymm8,YMMWORD[$L$mask52x4] + vpandq ymm9,ymm9,YMMWORD[$L$mask52x4] + vpandq ymm10,ymm10,YMMWORD[$L$mask52x4] + vpandq ymm11,ymm11,YMMWORD[$L$mask52x4] + vpandq ymm12,ymm12,YMMWORD[$L$mask52x4] + + + vpaddq ymm3,ymm3,ymm0 + vpaddq ymm4,ymm4,ymm1 + vpaddq ymm5,ymm5,ymm2 + vpaddq ymm6,ymm6,ymm23 + vpaddq ymm7,ymm7,ymm24 + vpaddq ymm8,ymm8,ymm25 + vpaddq ymm9,ymm9,ymm26 + vpaddq ymm10,ymm10,ymm27 + vpaddq ymm11,ymm11,ymm28 + vpaddq ymm12,ymm12,ymm29 + + + + vpcmpuq k1,ymm3,YMMWORD[$L$mask52x4],6 + vpcmpuq k2,ymm4,YMMWORD[$L$mask52x4],6 + kmovb r14d,k1 + kmovb r13d,k2 + shl r13b,4 + or r14b,r13b + + vpcmpuq k1,ymm5,YMMWORD[$L$mask52x4],6 + vpcmpuq k2,ymm6,YMMWORD[$L$mask52x4],6 + kmovb r13d,k1 + kmovb r12d,k2 + shl r12b,4 + or r13b,r12b + + vpcmpuq k1,ymm7,YMMWORD[$L$mask52x4],6 + vpcmpuq k2,ymm8,YMMWORD[$L$mask52x4],6 + kmovb r12d,k1 + kmovb r11d,k2 + shl r11b,4 + or r12b,r11b + + vpcmpuq k1,ymm9,YMMWORD[$L$mask52x4],6 + vpcmpuq k2,ymm10,YMMWORD[$L$mask52x4],6 + kmovb r11d,k1 + kmovb r10d,k2 + shl r10b,4 + or r11b,r10b + + vpcmpuq k1,ymm11,YMMWORD[$L$mask52x4],6 + vpcmpuq k2,ymm12,YMMWORD[$L$mask52x4],6 + kmovb r10d,k1 + kmovb r9d,k2 + shl r9b,4 + or r10b,r9b + + add r14b,r14b + adc r13b,r13b + adc r12b,r12b + adc r11b,r11b + adc r10b,r10b + + + vpcmpuq k1,ymm3,YMMWORD[$L$mask52x4],0 + vpcmpuq k2,ymm4,YMMWORD[$L$mask52x4],0 + kmovb r9d,k1 + kmovb r8d,k2 + shl r8b,4 + or r9b,r8b + + vpcmpuq k1,ymm5,YMMWORD[$L$mask52x4],0 + vpcmpuq k2,ymm6,YMMWORD[$L$mask52x4],0 + kmovb r8d,k1 + kmovb edx,k2 + shl dl,4 + or r8b,dl + + vpcmpuq k1,ymm7,YMMWORD[$L$mask52x4],0 + vpcmpuq k2,ymm8,YMMWORD[$L$mask52x4],0 + kmovb edx,k1 + kmovb ecx,k2 + shl cl,4 + or dl,cl + + vpcmpuq k1,ymm9,YMMWORD[$L$mask52x4],0 + vpcmpuq k2,ymm10,YMMWORD[$L$mask52x4],0 + kmovb ecx,k1 + kmovb ebx,k2 + shl bl,4 + or cl,bl + + vpcmpuq k1,ymm11,YMMWORD[$L$mask52x4],0 + vpcmpuq k2,ymm12,YMMWORD[$L$mask52x4],0 + kmovb ebx,k1 + kmovb eax,k2 + shl al,4 + or bl,al + + add r14b,r9b + adc r13b,r8b + adc r12b,dl + adc r11b,cl + adc r10b,bl + + xor r14b,r9b + xor r13b,r8b + xor r12b,dl + xor r11b,cl + xor r10b,bl + + kmovb k1,r14d + shr r14b,4 + kmovb k2,r14d + kmovb k3,r13d + shr r13b,4 + kmovb k4,r13d + kmovb k5,r12d + shr r12b,4 + kmovb k6,r12d + kmovb k7,r11d + + vpsubq ymm3{k1},ymm3,YMMWORD[$L$mask52x4] + vpsubq ymm4{k2},ymm4,YMMWORD[$L$mask52x4] + vpsubq ymm5{k3},ymm5,YMMWORD[$L$mask52x4] + vpsubq ymm6{k4},ymm6,YMMWORD[$L$mask52x4] + vpsubq 
ymm7{k5},ymm7,YMMWORD[$L$mask52x4] + vpsubq ymm8{k6},ymm8,YMMWORD[$L$mask52x4] + vpsubq ymm9{k7},ymm9,YMMWORD[$L$mask52x4] + + vpandq ymm3,ymm3,YMMWORD[$L$mask52x4] + vpandq ymm4,ymm4,YMMWORD[$L$mask52x4] + vpandq ymm5,ymm5,YMMWORD[$L$mask52x4] + vpandq ymm6,ymm6,YMMWORD[$L$mask52x4] + vpandq ymm7,ymm7,YMMWORD[$L$mask52x4] + vpandq ymm8,ymm8,YMMWORD[$L$mask52x4] + vpandq ymm9,ymm9,YMMWORD[$L$mask52x4] + + shr r11b,4 + kmovb k1,r11d + kmovb k2,r10d + shr r10b,4 + kmovb k3,r10d + + vpsubq ymm10{k1},ymm10,YMMWORD[$L$mask52x4] + vpsubq ymm11{k2},ymm11,YMMWORD[$L$mask52x4] + vpsubq ymm12{k3},ymm12,YMMWORD[$L$mask52x4] + + vpandq ymm10,ymm10,YMMWORD[$L$mask52x4] + vpandq ymm11,ymm11,YMMWORD[$L$mask52x4] + vpandq ymm12,ymm12,YMMWORD[$L$mask52x4] + + vmovdqu64 YMMWORD[rdi],ymm3 + vmovdqu64 YMMWORD[32+rdi],ymm4 + vmovdqu64 YMMWORD[64+rdi],ymm5 + vmovdqu64 YMMWORD[96+rdi],ymm6 + vmovdqu64 YMMWORD[128+rdi],ymm7 + vmovdqu64 YMMWORD[160+rdi],ymm8 + vmovdqu64 YMMWORD[192+rdi],ymm9 + vmovdqu64 YMMWORD[224+rdi],ymm10 + vmovdqu64 YMMWORD[256+rdi],ymm11 + vmovdqu64 YMMWORD[288+rdi],ymm12 + + vzeroupper + lea rax,[rsp] + + vmovdqa64 xmm6,XMMWORD[rax] + vmovdqa64 xmm7,XMMWORD[16+rax] + vmovdqa64 xmm8,XMMWORD[32+rax] + vmovdqa64 xmm9,XMMWORD[48+rax] + vmovdqa64 xmm10,XMMWORD[64+rax] + vmovdqa64 xmm11,XMMWORD[80+rax] + vmovdqa64 xmm12,XMMWORD[96+rax] + vmovdqa64 xmm13,XMMWORD[112+rax] + vmovdqa64 xmm14,XMMWORD[128+rax] + vmovdqa64 xmm15,XMMWORD[144+rax] + lea rax,[168+rsp] + mov r15,QWORD[rax] + + mov r14,QWORD[8+rax] + + mov r13,QWORD[16+rax] + + mov r12,QWORD[24+rax] + + mov rbp,QWORD[32+rax] + + mov rbx,QWORD[40+rax] + + lea rsp,[48+rax] + +$L$ossl_rsaz_amm52x40_x1_ifma256_epilogue: + + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +$L$SEH_end_ossl_rsaz_amm52x40_x1_ifma256: +section .rdata rdata align=32 +ALIGN 32 +$L$mask52x4: + DQ 0xfffffffffffff + DQ 0xfffffffffffff + DQ 0xfffffffffffff + DQ 0xfffffffffffff +section .text code align=64 + + +global ossl_rsaz_amm52x40_x2_ifma256 + +ALIGN 32 +ossl_rsaz_amm52x40_x2_ifma256: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_ossl_rsaz_amm52x40_x2_ifma256: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD[40+rsp] + + + +DB 243,15,30,250 + push rbx + + push rbp + + push r12 + + push r13 + + push r14 + + push r15 + + lea rsp,[((-168))+rsp] + vmovdqa64 XMMWORD[rsp],xmm6 + vmovdqa64 XMMWORD[16+rsp],xmm7 + vmovdqa64 XMMWORD[32+rsp],xmm8 + vmovdqa64 XMMWORD[48+rsp],xmm9 + vmovdqa64 XMMWORD[64+rsp],xmm10 + vmovdqa64 XMMWORD[80+rsp],xmm11 + vmovdqa64 XMMWORD[96+rsp],xmm12 + vmovdqa64 XMMWORD[112+rsp],xmm13 + vmovdqa64 XMMWORD[128+rsp],xmm14 + vmovdqa64 XMMWORD[144+rsp],xmm15 +$L$ossl_rsaz_amm52x40_x2_ifma256_body: + + vpxord ymm0,ymm0,ymm0 + vmovdqa64 ymm3,ymm0 + vmovdqa64 ymm4,ymm0 + vmovdqa64 ymm5,ymm0 + vmovdqa64 ymm6,ymm0 + vmovdqa64 ymm7,ymm0 + vmovdqa64 ymm8,ymm0 + vmovdqa64 ymm9,ymm0 + vmovdqa64 ymm10,ymm0 + vmovdqa64 ymm11,ymm0 + vmovdqa64 ymm12,ymm0 + + vmovdqa64 ymm13,ymm0 + vmovdqa64 ymm14,ymm0 + vmovdqa64 ymm15,ymm0 + vmovdqa64 ymm16,ymm0 + vmovdqa64 ymm17,ymm0 + vmovdqa64 ymm18,ymm0 + vmovdqa64 ymm19,ymm0 + vmovdqa64 ymm20,ymm0 + vmovdqa64 ymm21,ymm0 + vmovdqa64 ymm22,ymm0 + + + xor r9d,r9d + xor r15d,r15d + + mov r11,rdx + mov rax,0xfffffffffffff + + mov ebx,40 + +ALIGN 32 +$L$loop40: + mov r13,QWORD[r11] + + vpbroadcastq ymm1,r13 + mov rdx,QWORD[rsi] + mulx r12,r13,r13 + add r9,r13 + mov r10,r12 + adc r10,0 + + mov r13,QWORD[r8] + imul r13,r9 + and r13,rax 
+ + vpbroadcastq ymm2,r13 + mov rdx,QWORD[rcx] + mulx r12,r13,r13 + add r9,r13 + adc r10,r12 + + shr r9,52 + sal r10,12 + or r9,r10 + + vpmadd52luq ymm3,ymm1,YMMWORD[rsi] + vpmadd52luq ymm4,ymm1,YMMWORD[32+rsi] + vpmadd52luq ymm5,ymm1,YMMWORD[64+rsi] + vpmadd52luq ymm6,ymm1,YMMWORD[96+rsi] + vpmadd52luq ymm7,ymm1,YMMWORD[128+rsi] + vpmadd52luq ymm8,ymm1,YMMWORD[160+rsi] + vpmadd52luq ymm9,ymm1,YMMWORD[192+rsi] + vpmadd52luq ymm10,ymm1,YMMWORD[224+rsi] + vpmadd52luq ymm11,ymm1,YMMWORD[256+rsi] + vpmadd52luq ymm12,ymm1,YMMWORD[288+rsi] + + vpmadd52luq ymm3,ymm2,YMMWORD[rcx] + vpmadd52luq ymm4,ymm2,YMMWORD[32+rcx] + vpmadd52luq ymm5,ymm2,YMMWORD[64+rcx] + vpmadd52luq ymm6,ymm2,YMMWORD[96+rcx] + vpmadd52luq ymm7,ymm2,YMMWORD[128+rcx] + vpmadd52luq ymm8,ymm2,YMMWORD[160+rcx] + vpmadd52luq ymm9,ymm2,YMMWORD[192+rcx] + vpmadd52luq ymm10,ymm2,YMMWORD[224+rcx] + vpmadd52luq ymm11,ymm2,YMMWORD[256+rcx] + vpmadd52luq ymm12,ymm2,YMMWORD[288+rcx] + + + valignq ymm3,ymm4,ymm3,1 + valignq ymm4,ymm5,ymm4,1 + valignq ymm5,ymm6,ymm5,1 + valignq ymm6,ymm7,ymm6,1 + valignq ymm7,ymm8,ymm7,1 + valignq ymm8,ymm9,ymm8,1 + valignq ymm9,ymm10,ymm9,1 + valignq ymm10,ymm11,ymm10,1 + valignq ymm11,ymm12,ymm11,1 + valignq ymm12,ymm0,ymm12,1 + + vmovq r13,xmm3 + add r9,r13 + + vpmadd52huq ymm3,ymm1,YMMWORD[rsi] + vpmadd52huq ymm4,ymm1,YMMWORD[32+rsi] + vpmadd52huq ymm5,ymm1,YMMWORD[64+rsi] + vpmadd52huq ymm6,ymm1,YMMWORD[96+rsi] + vpmadd52huq ymm7,ymm1,YMMWORD[128+rsi] + vpmadd52huq ymm8,ymm1,YMMWORD[160+rsi] + vpmadd52huq ymm9,ymm1,YMMWORD[192+rsi] + vpmadd52huq ymm10,ymm1,YMMWORD[224+rsi] + vpmadd52huq ymm11,ymm1,YMMWORD[256+rsi] + vpmadd52huq ymm12,ymm1,YMMWORD[288+rsi] + + vpmadd52huq ymm3,ymm2,YMMWORD[rcx] + vpmadd52huq ymm4,ymm2,YMMWORD[32+rcx] + vpmadd52huq ymm5,ymm2,YMMWORD[64+rcx] + vpmadd52huq ymm6,ymm2,YMMWORD[96+rcx] + vpmadd52huq ymm7,ymm2,YMMWORD[128+rcx] + vpmadd52huq ymm8,ymm2,YMMWORD[160+rcx] + vpmadd52huq ymm9,ymm2,YMMWORD[192+rcx] + vpmadd52huq ymm10,ymm2,YMMWORD[224+rcx] + vpmadd52huq ymm11,ymm2,YMMWORD[256+rcx] + vpmadd52huq ymm12,ymm2,YMMWORD[288+rcx] + mov r13,QWORD[320+r11] + + vpbroadcastq ymm1,r13 + mov rdx,QWORD[320+rsi] + mulx r12,r13,r13 + add r15,r13 + mov r10,r12 + adc r10,0 + + mov r13,QWORD[8+r8] + imul r13,r15 + and r13,rax + + vpbroadcastq ymm2,r13 + mov rdx,QWORD[320+rcx] + mulx r12,r13,r13 + add r15,r13 + adc r10,r12 + + shr r15,52 + sal r10,12 + or r15,r10 + + vpmadd52luq ymm13,ymm1,YMMWORD[320+rsi] + vpmadd52luq ymm14,ymm1,YMMWORD[352+rsi] + vpmadd52luq ymm15,ymm1,YMMWORD[384+rsi] + vpmadd52luq ymm16,ymm1,YMMWORD[416+rsi] + vpmadd52luq ymm17,ymm1,YMMWORD[448+rsi] + vpmadd52luq ymm18,ymm1,YMMWORD[480+rsi] + vpmadd52luq ymm19,ymm1,YMMWORD[512+rsi] + vpmadd52luq ymm20,ymm1,YMMWORD[544+rsi] + vpmadd52luq ymm21,ymm1,YMMWORD[576+rsi] + vpmadd52luq ymm22,ymm1,YMMWORD[608+rsi] + + vpmadd52luq ymm13,ymm2,YMMWORD[320+rcx] + vpmadd52luq ymm14,ymm2,YMMWORD[352+rcx] + vpmadd52luq ymm15,ymm2,YMMWORD[384+rcx] + vpmadd52luq ymm16,ymm2,YMMWORD[416+rcx] + vpmadd52luq ymm17,ymm2,YMMWORD[448+rcx] + vpmadd52luq ymm18,ymm2,YMMWORD[480+rcx] + vpmadd52luq ymm19,ymm2,YMMWORD[512+rcx] + vpmadd52luq ymm20,ymm2,YMMWORD[544+rcx] + vpmadd52luq ymm21,ymm2,YMMWORD[576+rcx] + vpmadd52luq ymm22,ymm2,YMMWORD[608+rcx] + + + valignq ymm13,ymm14,ymm13,1 + valignq ymm14,ymm15,ymm14,1 + valignq ymm15,ymm16,ymm15,1 + valignq ymm16,ymm17,ymm16,1 + valignq ymm17,ymm18,ymm17,1 + valignq ymm18,ymm19,ymm18,1 + valignq ymm19,ymm20,ymm19,1 + valignq ymm20,ymm21,ymm20,1 + valignq ymm21,ymm22,ymm21,1 + valignq ymm22,ymm0,ymm22,1 + 
+ vmovq r13,xmm13 + add r15,r13 + + vpmadd52huq ymm13,ymm1,YMMWORD[320+rsi] + vpmadd52huq ymm14,ymm1,YMMWORD[352+rsi] + vpmadd52huq ymm15,ymm1,YMMWORD[384+rsi] + vpmadd52huq ymm16,ymm1,YMMWORD[416+rsi] + vpmadd52huq ymm17,ymm1,YMMWORD[448+rsi] + vpmadd52huq ymm18,ymm1,YMMWORD[480+rsi] + vpmadd52huq ymm19,ymm1,YMMWORD[512+rsi] + vpmadd52huq ymm20,ymm1,YMMWORD[544+rsi] + vpmadd52huq ymm21,ymm1,YMMWORD[576+rsi] + vpmadd52huq ymm22,ymm1,YMMWORD[608+rsi] + + vpmadd52huq ymm13,ymm2,YMMWORD[320+rcx] + vpmadd52huq ymm14,ymm2,YMMWORD[352+rcx] + vpmadd52huq ymm15,ymm2,YMMWORD[384+rcx] + vpmadd52huq ymm16,ymm2,YMMWORD[416+rcx] + vpmadd52huq ymm17,ymm2,YMMWORD[448+rcx] + vpmadd52huq ymm18,ymm2,YMMWORD[480+rcx] + vpmadd52huq ymm19,ymm2,YMMWORD[512+rcx] + vpmadd52huq ymm20,ymm2,YMMWORD[544+rcx] + vpmadd52huq ymm21,ymm2,YMMWORD[576+rcx] + vpmadd52huq ymm22,ymm2,YMMWORD[608+rcx] + lea r11,[8+r11] + dec ebx + jne NEAR $L$loop40 + + vpbroadcastq ymm0,r9 + vpblendd ymm3,ymm3,ymm0,3 + + + + vpsrlq ymm0,ymm3,52 + vpsrlq ymm1,ymm4,52 + vpsrlq ymm2,ymm5,52 + vpsrlq ymm23,ymm6,52 + vpsrlq ymm24,ymm7,52 + vpsrlq ymm25,ymm8,52 + vpsrlq ymm26,ymm9,52 + vpsrlq ymm27,ymm10,52 + vpsrlq ymm28,ymm11,52 + vpsrlq ymm29,ymm12,52 + + + valignq ymm29,ymm29,ymm28,3 + valignq ymm28,ymm28,ymm27,3 + valignq ymm27,ymm27,ymm26,3 + valignq ymm26,ymm26,ymm25,3 + valignq ymm25,ymm25,ymm24,3 + valignq ymm24,ymm24,ymm23,3 + valignq ymm23,ymm23,ymm2,3 + valignq ymm2,ymm2,ymm1,3 + valignq ymm1,ymm1,ymm0,3 + valignq ymm0,ymm0,YMMWORD[$L$zeros],3 + + + vpandq ymm3,ymm3,YMMWORD[$L$mask52x4] + vpandq ymm4,ymm4,YMMWORD[$L$mask52x4] + vpandq ymm5,ymm5,YMMWORD[$L$mask52x4] + vpandq ymm6,ymm6,YMMWORD[$L$mask52x4] + vpandq ymm7,ymm7,YMMWORD[$L$mask52x4] + vpandq ymm8,ymm8,YMMWORD[$L$mask52x4] + vpandq ymm9,ymm9,YMMWORD[$L$mask52x4] + vpandq ymm10,ymm10,YMMWORD[$L$mask52x4] + vpandq ymm11,ymm11,YMMWORD[$L$mask52x4] + vpandq ymm12,ymm12,YMMWORD[$L$mask52x4] + + + vpaddq ymm3,ymm3,ymm0 + vpaddq ymm4,ymm4,ymm1 + vpaddq ymm5,ymm5,ymm2 + vpaddq ymm6,ymm6,ymm23 + vpaddq ymm7,ymm7,ymm24 + vpaddq ymm8,ymm8,ymm25 + vpaddq ymm9,ymm9,ymm26 + vpaddq ymm10,ymm10,ymm27 + vpaddq ymm11,ymm11,ymm28 + vpaddq ymm12,ymm12,ymm29 + + + + vpcmpuq k1,ymm3,YMMWORD[$L$mask52x4],6 + vpcmpuq k2,ymm4,YMMWORD[$L$mask52x4],6 + kmovb r14d,k1 + kmovb r13d,k2 + shl r13b,4 + or r14b,r13b + + vpcmpuq k1,ymm5,YMMWORD[$L$mask52x4],6 + vpcmpuq k2,ymm6,YMMWORD[$L$mask52x4],6 + kmovb r13d,k1 + kmovb r12d,k2 + shl r12b,4 + or r13b,r12b + + vpcmpuq k1,ymm7,YMMWORD[$L$mask52x4],6 + vpcmpuq k2,ymm8,YMMWORD[$L$mask52x4],6 + kmovb r12d,k1 + kmovb r11d,k2 + shl r11b,4 + or r12b,r11b + + vpcmpuq k1,ymm9,YMMWORD[$L$mask52x4],6 + vpcmpuq k2,ymm10,YMMWORD[$L$mask52x4],6 + kmovb r11d,k1 + kmovb r10d,k2 + shl r10b,4 + or r11b,r10b + + vpcmpuq k1,ymm11,YMMWORD[$L$mask52x4],6 + vpcmpuq k2,ymm12,YMMWORD[$L$mask52x4],6 + kmovb r10d,k1 + kmovb r9d,k2 + shl r9b,4 + or r10b,r9b + + add r14b,r14b + adc r13b,r13b + adc r12b,r12b + adc r11b,r11b + adc r10b,r10b + + + vpcmpuq k1,ymm3,YMMWORD[$L$mask52x4],0 + vpcmpuq k2,ymm4,YMMWORD[$L$mask52x4],0 + kmovb r9d,k1 + kmovb r8d,k2 + shl r8b,4 + or r9b,r8b + + vpcmpuq k1,ymm5,YMMWORD[$L$mask52x4],0 + vpcmpuq k2,ymm6,YMMWORD[$L$mask52x4],0 + kmovb r8d,k1 + kmovb edx,k2 + shl dl,4 + or r8b,dl + + vpcmpuq k1,ymm7,YMMWORD[$L$mask52x4],0 + vpcmpuq k2,ymm8,YMMWORD[$L$mask52x4],0 + kmovb edx,k1 + kmovb ecx,k2 + shl cl,4 + or dl,cl + + vpcmpuq k1,ymm9,YMMWORD[$L$mask52x4],0 + vpcmpuq k2,ymm10,YMMWORD[$L$mask52x4],0 + kmovb ecx,k1 + kmovb ebx,k2 + shl bl,4 + or cl,bl + + 
vpcmpuq k1,ymm11,YMMWORD[$L$mask52x4],0 + vpcmpuq k2,ymm12,YMMWORD[$L$mask52x4],0 + kmovb ebx,k1 + kmovb eax,k2 + shl al,4 + or bl,al + + add r14b,r9b + adc r13b,r8b + adc r12b,dl + adc r11b,cl + adc r10b,bl + + xor r14b,r9b + xor r13b,r8b + xor r12b,dl + xor r11b,cl + xor r10b,bl + + kmovb k1,r14d + shr r14b,4 + kmovb k2,r14d + kmovb k3,r13d + shr r13b,4 + kmovb k4,r13d + kmovb k5,r12d + shr r12b,4 + kmovb k6,r12d + kmovb k7,r11d + + vpsubq ymm3{k1},ymm3,YMMWORD[$L$mask52x4] + vpsubq ymm4{k2},ymm4,YMMWORD[$L$mask52x4] + vpsubq ymm5{k3},ymm5,YMMWORD[$L$mask52x4] + vpsubq ymm6{k4},ymm6,YMMWORD[$L$mask52x4] + vpsubq ymm7{k5},ymm7,YMMWORD[$L$mask52x4] + vpsubq ymm8{k6},ymm8,YMMWORD[$L$mask52x4] + vpsubq ymm9{k7},ymm9,YMMWORD[$L$mask52x4] + + vpandq ymm3,ymm3,YMMWORD[$L$mask52x4] + vpandq ymm4,ymm4,YMMWORD[$L$mask52x4] + vpandq ymm5,ymm5,YMMWORD[$L$mask52x4] + vpandq ymm6,ymm6,YMMWORD[$L$mask52x4] + vpandq ymm7,ymm7,YMMWORD[$L$mask52x4] + vpandq ymm8,ymm8,YMMWORD[$L$mask52x4] + vpandq ymm9,ymm9,YMMWORD[$L$mask52x4] + + shr r11b,4 + kmovb k1,r11d + kmovb k2,r10d + shr r10b,4 + kmovb k3,r10d + + vpsubq ymm10{k1},ymm10,YMMWORD[$L$mask52x4] + vpsubq ymm11{k2},ymm11,YMMWORD[$L$mask52x4] + vpsubq ymm12{k3},ymm12,YMMWORD[$L$mask52x4] + + vpandq ymm10,ymm10,YMMWORD[$L$mask52x4] + vpandq ymm11,ymm11,YMMWORD[$L$mask52x4] + vpandq ymm12,ymm12,YMMWORD[$L$mask52x4] + + vpbroadcastq ymm0,r15 + vpblendd ymm13,ymm13,ymm0,3 + + + + vpsrlq ymm0,ymm13,52 + vpsrlq ymm1,ymm14,52 + vpsrlq ymm2,ymm15,52 + vpsrlq ymm23,ymm16,52 + vpsrlq ymm24,ymm17,52 + vpsrlq ymm25,ymm18,52 + vpsrlq ymm26,ymm19,52 + vpsrlq ymm27,ymm20,52 + vpsrlq ymm28,ymm21,52 + vpsrlq ymm29,ymm22,52 + + + valignq ymm29,ymm29,ymm28,3 + valignq ymm28,ymm28,ymm27,3 + valignq ymm27,ymm27,ymm26,3 + valignq ymm26,ymm26,ymm25,3 + valignq ymm25,ymm25,ymm24,3 + valignq ymm24,ymm24,ymm23,3 + valignq ymm23,ymm23,ymm2,3 + valignq ymm2,ymm2,ymm1,3 + valignq ymm1,ymm1,ymm0,3 + valignq ymm0,ymm0,YMMWORD[$L$zeros],3 + + + vpandq ymm13,ymm13,YMMWORD[$L$mask52x4] + vpandq ymm14,ymm14,YMMWORD[$L$mask52x4] + vpandq ymm15,ymm15,YMMWORD[$L$mask52x4] + vpandq ymm16,ymm16,YMMWORD[$L$mask52x4] + vpandq ymm17,ymm17,YMMWORD[$L$mask52x4] + vpandq ymm18,ymm18,YMMWORD[$L$mask52x4] + vpandq ymm19,ymm19,YMMWORD[$L$mask52x4] + vpandq ymm20,ymm20,YMMWORD[$L$mask52x4] + vpandq ymm21,ymm21,YMMWORD[$L$mask52x4] + vpandq ymm22,ymm22,YMMWORD[$L$mask52x4] + + + vpaddq ymm13,ymm13,ymm0 + vpaddq ymm14,ymm14,ymm1 + vpaddq ymm15,ymm15,ymm2 + vpaddq ymm16,ymm16,ymm23 + vpaddq ymm17,ymm17,ymm24 + vpaddq ymm18,ymm18,ymm25 + vpaddq ymm19,ymm19,ymm26 + vpaddq ymm20,ymm20,ymm27 + vpaddq ymm21,ymm21,ymm28 + vpaddq ymm22,ymm22,ymm29 + + + + vpcmpuq k1,ymm13,YMMWORD[$L$mask52x4],6 + vpcmpuq k2,ymm14,YMMWORD[$L$mask52x4],6 + kmovb r14d,k1 + kmovb r13d,k2 + shl r13b,4 + or r14b,r13b + + vpcmpuq k1,ymm15,YMMWORD[$L$mask52x4],6 + vpcmpuq k2,ymm16,YMMWORD[$L$mask52x4],6 + kmovb r13d,k1 + kmovb r12d,k2 + shl r12b,4 + or r13b,r12b + + vpcmpuq k1,ymm17,YMMWORD[$L$mask52x4],6 + vpcmpuq k2,ymm18,YMMWORD[$L$mask52x4],6 + kmovb r12d,k1 + kmovb r11d,k2 + shl r11b,4 + or r12b,r11b + + vpcmpuq k1,ymm19,YMMWORD[$L$mask52x4],6 + vpcmpuq k2,ymm20,YMMWORD[$L$mask52x4],6 + kmovb r11d,k1 + kmovb r10d,k2 + shl r10b,4 + or r11b,r10b + + vpcmpuq k1,ymm21,YMMWORD[$L$mask52x4],6 + vpcmpuq k2,ymm22,YMMWORD[$L$mask52x4],6 + kmovb r10d,k1 + kmovb r9d,k2 + shl r9b,4 + or r10b,r9b + + add r14b,r14b + adc r13b,r13b + adc r12b,r12b + adc r11b,r11b + adc r10b,r10b + + + vpcmpuq k1,ymm13,YMMWORD[$L$mask52x4],0 + vpcmpuq 
k2,ymm14,YMMWORD[$L$mask52x4],0 + kmovb r9d,k1 + kmovb r8d,k2 + shl r8b,4 + or r9b,r8b + + vpcmpuq k1,ymm15,YMMWORD[$L$mask52x4],0 + vpcmpuq k2,ymm16,YMMWORD[$L$mask52x4],0 + kmovb r8d,k1 + kmovb edx,k2 + shl dl,4 + or r8b,dl + + vpcmpuq k1,ymm17,YMMWORD[$L$mask52x4],0 + vpcmpuq k2,ymm18,YMMWORD[$L$mask52x4],0 + kmovb edx,k1 + kmovb ecx,k2 + shl cl,4 + or dl,cl + + vpcmpuq k1,ymm19,YMMWORD[$L$mask52x4],0 + vpcmpuq k2,ymm20,YMMWORD[$L$mask52x4],0 + kmovb ecx,k1 + kmovb ebx,k2 + shl bl,4 + or cl,bl + + vpcmpuq k1,ymm21,YMMWORD[$L$mask52x4],0 + vpcmpuq k2,ymm22,YMMWORD[$L$mask52x4],0 + kmovb ebx,k1 + kmovb eax,k2 + shl al,4 + or bl,al + + add r14b,r9b + adc r13b,r8b + adc r12b,dl + adc r11b,cl + adc r10b,bl + + xor r14b,r9b + xor r13b,r8b + xor r12b,dl + xor r11b,cl + xor r10b,bl + + kmovb k1,r14d + shr r14b,4 + kmovb k2,r14d + kmovb k3,r13d + shr r13b,4 + kmovb k4,r13d + kmovb k5,r12d + shr r12b,4 + kmovb k6,r12d + kmovb k7,r11d + + vpsubq ymm13{k1},ymm13,YMMWORD[$L$mask52x4] + vpsubq ymm14{k2},ymm14,YMMWORD[$L$mask52x4] + vpsubq ymm15{k3},ymm15,YMMWORD[$L$mask52x4] + vpsubq ymm16{k4},ymm16,YMMWORD[$L$mask52x4] + vpsubq ymm17{k5},ymm17,YMMWORD[$L$mask52x4] + vpsubq ymm18{k6},ymm18,YMMWORD[$L$mask52x4] + vpsubq ymm19{k7},ymm19,YMMWORD[$L$mask52x4] + + vpandq ymm13,ymm13,YMMWORD[$L$mask52x4] + vpandq ymm14,ymm14,YMMWORD[$L$mask52x4] + vpandq ymm15,ymm15,YMMWORD[$L$mask52x4] + vpandq ymm16,ymm16,YMMWORD[$L$mask52x4] + vpandq ymm17,ymm17,YMMWORD[$L$mask52x4] + vpandq ymm18,ymm18,YMMWORD[$L$mask52x4] + vpandq ymm19,ymm19,YMMWORD[$L$mask52x4] + + shr r11b,4 + kmovb k1,r11d + kmovb k2,r10d + shr r10b,4 + kmovb k3,r10d + + vpsubq ymm20{k1},ymm20,YMMWORD[$L$mask52x4] + vpsubq ymm21{k2},ymm21,YMMWORD[$L$mask52x4] + vpsubq ymm22{k3},ymm22,YMMWORD[$L$mask52x4] + + vpandq ymm20,ymm20,YMMWORD[$L$mask52x4] + vpandq ymm21,ymm21,YMMWORD[$L$mask52x4] + vpandq ymm22,ymm22,YMMWORD[$L$mask52x4] + + vmovdqu64 YMMWORD[rdi],ymm3 + vmovdqu64 YMMWORD[32+rdi],ymm4 + vmovdqu64 YMMWORD[64+rdi],ymm5 + vmovdqu64 YMMWORD[96+rdi],ymm6 + vmovdqu64 YMMWORD[128+rdi],ymm7 + vmovdqu64 YMMWORD[160+rdi],ymm8 + vmovdqu64 YMMWORD[192+rdi],ymm9 + vmovdqu64 YMMWORD[224+rdi],ymm10 + vmovdqu64 YMMWORD[256+rdi],ymm11 + vmovdqu64 YMMWORD[288+rdi],ymm12 + + vmovdqu64 YMMWORD[320+rdi],ymm13 + vmovdqu64 YMMWORD[352+rdi],ymm14 + vmovdqu64 YMMWORD[384+rdi],ymm15 + vmovdqu64 YMMWORD[416+rdi],ymm16 + vmovdqu64 YMMWORD[448+rdi],ymm17 + vmovdqu64 YMMWORD[480+rdi],ymm18 + vmovdqu64 YMMWORD[512+rdi],ymm19 + vmovdqu64 YMMWORD[544+rdi],ymm20 + vmovdqu64 YMMWORD[576+rdi],ymm21 + vmovdqu64 YMMWORD[608+rdi],ymm22 + + vzeroupper + lea rax,[rsp] + + vmovdqa64 xmm6,XMMWORD[rax] + vmovdqa64 xmm7,XMMWORD[16+rax] + vmovdqa64 xmm8,XMMWORD[32+rax] + vmovdqa64 xmm9,XMMWORD[48+rax] + vmovdqa64 xmm10,XMMWORD[64+rax] + vmovdqa64 xmm11,XMMWORD[80+rax] + vmovdqa64 xmm12,XMMWORD[96+rax] + vmovdqa64 xmm13,XMMWORD[112+rax] + vmovdqa64 xmm14,XMMWORD[128+rax] + vmovdqa64 xmm15,XMMWORD[144+rax] + lea rax,[168+rsp] + mov r15,QWORD[rax] + + mov r14,QWORD[8+rax] + + mov r13,QWORD[16+rax] + + mov r12,QWORD[24+rax] + + mov rbp,QWORD[32+rax] + + mov rbx,QWORD[40+rax] + + lea rsp,[48+rax] + +$L$ossl_rsaz_amm52x40_x2_ifma256_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +$L$SEH_end_ossl_rsaz_amm52x40_x2_ifma256: +section .text code align=64 + + +ALIGN 32 +global ossl_extract_multiplier_2x40_win5 + +ossl_extract_multiplier_2x40_win5: + +DB 243,15,30,250 + vmovdqa64 ymm24,YMMWORD[$L$ones] + vpbroadcastq ymm22,r8 + vpbroadcastq 
ymm23,r9 + lea rax,[20480+rdx] + + + mov r10,rdx + + + vpxor xmm0,xmm0,xmm0 + vmovdqa64 ymm1,ymm0 + vmovdqa64 ymm2,ymm0 + vmovdqa64 ymm3,ymm0 + vmovdqa64 ymm4,ymm0 + vmovdqa64 ymm5,ymm0 + vmovdqa64 ymm16,ymm0 + vmovdqa64 ymm17,ymm0 + vmovdqa64 ymm18,ymm0 + vmovdqa64 ymm19,ymm0 + vpxorq ymm21,ymm21,ymm21 +ALIGN 32 +$L$loop_0: + vpcmpq k1,ymm22,ymm21,0 + vmovdqu64 ymm20,YMMWORD[rdx] + vpblendmq ymm0{k1},ymm0,ymm20 + vmovdqu64 ymm20,YMMWORD[32+rdx] + vpblendmq ymm1{k1},ymm1,ymm20 + vmovdqu64 ymm20,YMMWORD[64+rdx] + vpblendmq ymm2{k1},ymm2,ymm20 + vmovdqu64 ymm20,YMMWORD[96+rdx] + vpblendmq ymm3{k1},ymm3,ymm20 + vmovdqu64 ymm20,YMMWORD[128+rdx] + vpblendmq ymm4{k1},ymm4,ymm20 + vmovdqu64 ymm20,YMMWORD[160+rdx] + vpblendmq ymm5{k1},ymm5,ymm20 + vmovdqu64 ymm20,YMMWORD[192+rdx] + vpblendmq ymm16{k1},ymm16,ymm20 + vmovdqu64 ymm20,YMMWORD[224+rdx] + vpblendmq ymm17{k1},ymm17,ymm20 + vmovdqu64 ymm20,YMMWORD[256+rdx] + vpblendmq ymm18{k1},ymm18,ymm20 + vmovdqu64 ymm20,YMMWORD[288+rdx] + vpblendmq ymm19{k1},ymm19,ymm20 + vpaddq ymm21,ymm21,ymm24 + add rdx,640 + cmp rax,rdx + jne NEAR $L$loop_0 + vmovdqu64 YMMWORD[rcx],ymm0 + vmovdqu64 YMMWORD[32+rcx],ymm1 + vmovdqu64 YMMWORD[64+rcx],ymm2 + vmovdqu64 YMMWORD[96+rcx],ymm3 + vmovdqu64 YMMWORD[128+rcx],ymm4 + vmovdqu64 YMMWORD[160+rcx],ymm5 + vmovdqu64 YMMWORD[192+rcx],ymm16 + vmovdqu64 YMMWORD[224+rcx],ymm17 + vmovdqu64 YMMWORD[256+rcx],ymm18 + vmovdqu64 YMMWORD[288+rcx],ymm19 + mov rdx,r10 + vpxorq ymm21,ymm21,ymm21 +ALIGN 32 +$L$loop_320: + vpcmpq k1,ymm23,ymm21,0 + vmovdqu64 ymm20,YMMWORD[320+rdx] + vpblendmq ymm0{k1},ymm0,ymm20 + vmovdqu64 ymm20,YMMWORD[352+rdx] + vpblendmq ymm1{k1},ymm1,ymm20 + vmovdqu64 ymm20,YMMWORD[384+rdx] + vpblendmq ymm2{k1},ymm2,ymm20 + vmovdqu64 ymm20,YMMWORD[416+rdx] + vpblendmq ymm3{k1},ymm3,ymm20 + vmovdqu64 ymm20,YMMWORD[448+rdx] + vpblendmq ymm4{k1},ymm4,ymm20 + vmovdqu64 ymm20,YMMWORD[480+rdx] + vpblendmq ymm5{k1},ymm5,ymm20 + vmovdqu64 ymm20,YMMWORD[512+rdx] + vpblendmq ymm16{k1},ymm16,ymm20 + vmovdqu64 ymm20,YMMWORD[544+rdx] + vpblendmq ymm17{k1},ymm17,ymm20 + vmovdqu64 ymm20,YMMWORD[576+rdx] + vpblendmq ymm18{k1},ymm18,ymm20 + vmovdqu64 ymm20,YMMWORD[608+rdx] + vpblendmq ymm19{k1},ymm19,ymm20 + vpaddq ymm21,ymm21,ymm24 + add rdx,640 + cmp rax,rdx + jne NEAR $L$loop_320 + vmovdqu64 YMMWORD[320+rcx],ymm0 + vmovdqu64 YMMWORD[352+rcx],ymm1 + vmovdqu64 YMMWORD[384+rcx],ymm2 + vmovdqu64 YMMWORD[416+rcx],ymm3 + vmovdqu64 YMMWORD[448+rcx],ymm4 + vmovdqu64 YMMWORD[480+rcx],ymm5 + vmovdqu64 YMMWORD[512+rcx],ymm16 + vmovdqu64 YMMWORD[544+rcx],ymm17 + vmovdqu64 YMMWORD[576+rcx],ymm18 + vmovdqu64 YMMWORD[608+rcx],ymm19 + + DB 0F3h,0C3h ;repret + + +section .rdata rdata align=32 +ALIGN 32 +$L$ones: + DQ 1,1,1,1 +$L$zeros: + DQ 0,0,0,0 +EXTERN __imp_RtlVirtualUnwind + +ALIGN 16 +rsaz_avx_handler: + push rsi + push rdi + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + pushfq + sub rsp,64 + + mov rax,QWORD[120+r8] + mov rbx,QWORD[248+r8] + + mov rsi,QWORD[8+r9] + mov r11,QWORD[56+r9] + + mov r10d,DWORD[r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jb NEAR $L$common_seh_tail + + mov r10d,DWORD[4+r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jae NEAR $L$common_seh_tail + + mov rax,QWORD[152+r8] + + lea rsi,[rax] + lea rdi,[512+r8] + mov ecx,20 + DD 0xa548f3fc + + lea rax,[216+rax] + + mov rbx,QWORD[((-8))+rax] + mov rbp,QWORD[((-16))+rax] + mov r12,QWORD[((-24))+rax] + mov r13,QWORD[((-32))+rax] + mov r14,QWORD[((-40))+rax] + mov r15,QWORD[((-48))+rax] + mov QWORD[144+r8],rbx + mov QWORD[160+r8],rbp + mov 
QWORD[216+r8],r12 + mov QWORD[224+r8],r13 + mov QWORD[232+r8],r14 + mov QWORD[240+r8],r15 + +$L$common_seh_tail: + mov rdi,QWORD[8+rax] + mov rsi,QWORD[16+rax] + mov QWORD[152+r8],rax + mov QWORD[168+r8],rsi + mov QWORD[176+r8],rdi + + mov rdi,QWORD[40+r9] + mov rsi,r8 + mov ecx,154 + DD 0xa548f3fc + + mov rsi,r9 + xor rcx,rcx + mov rdx,QWORD[8+rsi] + mov r8,QWORD[rsi] + mov r9,QWORD[16+rsi] + mov r10,QWORD[40+rsi] + lea r11,[56+rsi] + lea r12,[24+rsi] + mov QWORD[32+rsp],r10 + mov QWORD[40+rsp],r11 + mov QWORD[48+rsp],r12 + mov QWORD[56+rsp],rcx + call QWORD[__imp_RtlVirtualUnwind] + + mov eax,1 + add rsp,64 + popfq + pop r15 + pop r14 + pop r13 + pop r12 + pop rbp + pop rbx + pop rdi + pop rsi + DB 0F3h,0C3h ;repret + + +section .pdata rdata align=4 +ALIGN 4 + DD $L$SEH_begin_ossl_rsaz_amm52x40_x1_ifma256 wrt ..imagebase + DD $L$SEH_end_ossl_rsaz_amm52x40_x1_ifma256 wrt ..imagebase + DD $L$SEH_info_ossl_rsaz_amm52x40_x1_ifma256 wrt ..imagebase + + DD $L$SEH_begin_ossl_rsaz_amm52x40_x2_ifma256 wrt ..imagebase + DD $L$SEH_end_ossl_rsaz_amm52x40_x2_ifma256 wrt ..imagebase + DD $L$SEH_info_ossl_rsaz_amm52x40_x2_ifma256 wrt ..imagebase + +section .xdata rdata align=8 +ALIGN 8 +$L$SEH_info_ossl_rsaz_amm52x40_x1_ifma256: +DB 9,0,0,0 + DD rsaz_avx_handler wrt ..imagebase + DD $L$ossl_rsaz_amm52x40_x1_ifma256_body wrt ..imagebase,$L$ossl_rsaz_amm52x40_x1_ifma256_epilogue wrt ..imagebase +$L$SEH_info_ossl_rsaz_amm52x40_x2_ifma256: +DB 9,0,0,0 + DD rsaz_avx_handler wrt ..imagebase + DD $L$ossl_rsaz_amm52x40_x2_ifma256_body wrt ..imagebase,$L$ossl_rsaz_amm52x40_x2_ifma256_epilogue wrt ..imagebase diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/bn/rsaz-avx2.nasm b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/bn/rsaz-avx2.nasm index 7342e16c22c5..bc74ee2e0ba9 100644 --- a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/bn/rsaz-avx2.nasm +++ b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/bn/rsaz-avx2.nasm @@ -1832,6 +1832,7 @@ rsaz_avx2_eligible: DB 0F3h,0C3h ;repret +section .rdata rdata align=64 ALIGN 64 $L$and_mask: DQ 0x1fffffff,0x1fffffff,0x1fffffff,0x1fffffff @@ -1843,6 +1844,7 @@ $L$inc: DD 0,0,0,0,1,1,1,1 DD 2,2,2,2,3,3,3,3 DD 4,4,4,4,4,4,4,4 +section .text ALIGN 64 EXTERN __imp_RtlVirtualUnwind diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/bn/rsaz-x86_64.nasm b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/bn/rsaz-x86_64.nasm index f407312e9502..047a8f94ccbf 100644 --- a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/bn/rsaz-x86_64.nasm +++ b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/bn/rsaz-x86_64.nasm @@ -2094,10 +2094,12 @@ $L$SEH_end_rsaz_512_gather4: +section .rdata rdata align=64 ALIGN 64 $L$inc: DD 0,0,1,1 DD 2,2,2,2 +section .text EXTERN __imp_RtlVirtualUnwind ALIGN 16 diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/bn/x86_64-mont5.nasm b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/bn/x86_64-mont5.nasm index 260113b01761..e6ca669126db 100644 --- a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/bn/x86_64-mont5.nasm +++ b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/bn/x86_64-mont5.nasm @@ -3671,6 +3671,7 @@ $L$gather: $L$SEH_end_bn_gather5: +section .rdata rdata align=64 ALIGN 64 $L$inc: DD 0,0,1,1 @@ -3681,6 +3682,7 @@ DB 99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111 DB 114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79 DB 71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111 DB 
112,101,110,115,115,108,46,111,114,103,62,0 +section .text EXTERN __imp_RtlVirtualUnwind ALIGN 16 diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/ec/ecp_nistz256-x86_64.nasm b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/ec/ecp_nistz256-x86_64.nasm index b35e99bc90b0..6b01061d9754 100644 --- a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/ec/ecp_nistz256-x86_64.nasm +++ b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/ec/ecp_nistz256-x86_64.nasm @@ -2,8 +2,7 @@ default rel %define XMMWORD %define YMMWORD %define ZMMWORD -section .text code align=64 - +section .rdata rdata align=4096 global ecp_nistz256_precomputed ALIGN 4096 @@ -2382,6 +2381,7 @@ section .text code align=64 EXTERN OPENSSL_ia32cap_P +section .rdata rdata align=4096 ALIGN 64 $L$poly: DQ 0xffffffffffffffff,0x00000000ffffffff,0x0000000000000000,0xffffffff00000001 @@ -2404,6 +2404,7 @@ $L$ord: DQ 0xf3b9cac2fc632551,0xbce6faada7179e84,0xffffffffffffffff,0xffffffff00000000 $L$ordK: DQ 0xccd1c8aaee00bc4f +section .text global ecp_nistz256_mul_by_2 diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/modes/aes-gcm-avx512.nasm b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/modes/aes-gcm-avx512.nasm new file mode 100644 index 000000000000..f4f6398e2225 --- /dev/null +++ b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/modes/aes-gcm-avx512.nasm @@ -0,0 +1,136520 @@ +default rel +%define XMMWORD +%define YMMWORD +%define ZMMWORD +EXTERN OPENSSL_ia32cap_P +global ossl_vaes_vpclmulqdq_capable + +ALIGN 32 +ossl_vaes_vpclmulqdq_capable: + mov rcx,QWORD[((OPENSSL_ia32cap_P+8))] + + mov rdx,6600291188736 + xor eax,eax + and rcx,rdx + cmp rcx,rdx + cmove rax,rcx + DB 0F3h,0C3h ;repret + +section .text code align=64 + +global ossl_aes_gcm_init_avx512 + +ALIGN 32 +ossl_aes_gcm_init_avx512: + +DB 243,15,30,250 + vpxorq xmm16,xmm16,xmm16 + + + mov eax,DWORD[240+rcx] + cmp eax,9 + je NEAR $L$aes_128_0 + cmp eax,11 + je NEAR $L$aes_192_0 + cmp eax,13 + je NEAR $L$aes_256_0 + jmp NEAR $L$exit_aes_0 +ALIGN 32 +$L$aes_128_0: + vpxorq xmm16,xmm16,XMMWORD[rcx] + + vaesenc xmm16,xmm16,XMMWORD[16+rcx] + + vaesenc xmm16,xmm16,XMMWORD[32+rcx] + + vaesenc xmm16,xmm16,XMMWORD[48+rcx] + + vaesenc xmm16,xmm16,XMMWORD[64+rcx] + + vaesenc xmm16,xmm16,XMMWORD[80+rcx] + + vaesenc xmm16,xmm16,XMMWORD[96+rcx] + + vaesenc xmm16,xmm16,XMMWORD[112+rcx] + + vaesenc xmm16,xmm16,XMMWORD[128+rcx] + + vaesenc xmm16,xmm16,XMMWORD[144+rcx] + + vaesenclast xmm16,xmm16,XMMWORD[160+rcx] + jmp NEAR $L$exit_aes_0 +ALIGN 32 +$L$aes_192_0: + vpxorq xmm16,xmm16,XMMWORD[rcx] + + vaesenc xmm16,xmm16,XMMWORD[16+rcx] + + vaesenc xmm16,xmm16,XMMWORD[32+rcx] + + vaesenc xmm16,xmm16,XMMWORD[48+rcx] + + vaesenc xmm16,xmm16,XMMWORD[64+rcx] + + vaesenc xmm16,xmm16,XMMWORD[80+rcx] + + vaesenc xmm16,xmm16,XMMWORD[96+rcx] + + vaesenc xmm16,xmm16,XMMWORD[112+rcx] + + vaesenc xmm16,xmm16,XMMWORD[128+rcx] + + vaesenc xmm16,xmm16,XMMWORD[144+rcx] + + vaesenc xmm16,xmm16,XMMWORD[160+rcx] + + vaesenc xmm16,xmm16,XMMWORD[176+rcx] + + vaesenclast xmm16,xmm16,XMMWORD[192+rcx] + jmp NEAR $L$exit_aes_0 +ALIGN 32 +$L$aes_256_0: + vpxorq xmm16,xmm16,XMMWORD[rcx] + + vaesenc xmm16,xmm16,XMMWORD[16+rcx] + + vaesenc xmm16,xmm16,XMMWORD[32+rcx] + + vaesenc xmm16,xmm16,XMMWORD[48+rcx] + + vaesenc xmm16,xmm16,XMMWORD[64+rcx] + + vaesenc xmm16,xmm16,XMMWORD[80+rcx] + + vaesenc xmm16,xmm16,XMMWORD[96+rcx] + + vaesenc xmm16,xmm16,XMMWORD[112+rcx] + + vaesenc xmm16,xmm16,XMMWORD[128+rcx] + + vaesenc xmm16,xmm16,XMMWORD[144+rcx] + + 
vaesenc xmm16,xmm16,XMMWORD[160+rcx] + + vaesenc xmm16,xmm16,XMMWORD[176+rcx] + + vaesenc xmm16,xmm16,XMMWORD[192+rcx] + + vaesenc xmm16,xmm16,XMMWORD[208+rcx] + + vaesenclast xmm16,xmm16,XMMWORD[224+rcx] + jmp NEAR $L$exit_aes_0 +$L$exit_aes_0: + + vpshufb xmm16,xmm16,XMMWORD[SHUF_MASK] + + vmovdqa64 xmm2,xmm16 + vpsllq xmm16,xmm16,1 + vpsrlq xmm2,xmm2,63 + vmovdqa xmm1,xmm2 + vpslldq xmm2,xmm2,8 + vpsrldq xmm1,xmm1,8 + vporq xmm16,xmm16,xmm2 + + vpshufd xmm2,xmm1,36 + vpcmpeqd xmm2,xmm2,XMMWORD[TWOONE] + vpand xmm2,xmm2,XMMWORD[POLY] + vpxorq xmm16,xmm16,xmm2 + + vmovdqu64 XMMWORD[336+rdx],xmm16 + vshufi32x4 ymm4,ymm16,ymm16,0x00 + vmovdqa ymm3,ymm4 + + vpclmulqdq ymm0,ymm3,ymm4,0x11 + vpclmulqdq ymm1,ymm3,ymm4,0x00 + vpclmulqdq ymm2,ymm3,ymm4,0x01 + vpclmulqdq ymm3,ymm3,ymm4,0x10 + vpxorq ymm3,ymm3,ymm2 + + vpsrldq ymm2,ymm3,8 + vpslldq ymm3,ymm3,8 + vpxorq ymm0,ymm0,ymm2 + vpxorq ymm3,ymm3,ymm1 + + + + vmovdqu64 ymm2,YMMWORD[POLY2] + + vpclmulqdq ymm1,ymm2,ymm3,0x01 + vpslldq ymm1,ymm1,8 + vpxorq ymm3,ymm3,ymm1 + + + + vpclmulqdq ymm1,ymm2,ymm3,0x00 + vpsrldq ymm1,ymm1,4 + vpclmulqdq ymm3,ymm2,ymm3,0x10 + vpslldq ymm3,ymm3,4 + + vpternlogq ymm3,ymm0,ymm1,0x96 + + vmovdqu64 XMMWORD[320+rdx],xmm3 + vinserti64x2 ymm4,ymm3,xmm16,1 + vmovdqa64 ymm5,ymm4 + + vpclmulqdq ymm0,ymm4,ymm3,0x11 + vpclmulqdq ymm1,ymm4,ymm3,0x00 + vpclmulqdq ymm2,ymm4,ymm3,0x01 + vpclmulqdq ymm4,ymm4,ymm3,0x10 + vpxorq ymm4,ymm4,ymm2 + + vpsrldq ymm2,ymm4,8 + vpslldq ymm4,ymm4,8 + vpxorq ymm0,ymm0,ymm2 + vpxorq ymm4,ymm4,ymm1 + + + + vmovdqu64 ymm2,YMMWORD[POLY2] + + vpclmulqdq ymm1,ymm2,ymm4,0x01 + vpslldq ymm1,ymm1,8 + vpxorq ymm4,ymm4,ymm1 + + + + vpclmulqdq ymm1,ymm2,ymm4,0x00 + vpsrldq ymm1,ymm1,4 + vpclmulqdq ymm4,ymm2,ymm4,0x10 + vpslldq ymm4,ymm4,4 + + vpternlogq ymm4,ymm0,ymm1,0x96 + + vmovdqu64 YMMWORD[288+rdx],ymm4 + + vinserti64x4 zmm4,zmm4,ymm5,1 + + + vshufi64x2 zmm3,zmm4,zmm4,0x00 + vmovdqa64 zmm5,zmm4 + + vpclmulqdq zmm0,zmm4,zmm3,0x11 + vpclmulqdq zmm1,zmm4,zmm3,0x00 + vpclmulqdq zmm2,zmm4,zmm3,0x01 + vpclmulqdq zmm4,zmm4,zmm3,0x10 + vpxorq zmm4,zmm4,zmm2 + + vpsrldq zmm2,zmm4,8 + vpslldq zmm4,zmm4,8 + vpxorq zmm0,zmm0,zmm2 + vpxorq zmm4,zmm4,zmm1 + + + + vmovdqu64 zmm2,ZMMWORD[POLY2] + + vpclmulqdq zmm1,zmm2,zmm4,0x01 + vpslldq zmm1,zmm1,8 + vpxorq zmm4,zmm4,zmm1 + + + + vpclmulqdq zmm1,zmm2,zmm4,0x00 + vpsrldq zmm1,zmm1,4 + vpclmulqdq zmm4,zmm2,zmm4,0x10 + vpslldq zmm4,zmm4,4 + + vpternlogq zmm4,zmm0,zmm1,0x96 + + vmovdqu64 ZMMWORD[224+rdx],zmm4 + vshufi64x2 zmm3,zmm4,zmm4,0x00 + + vpclmulqdq zmm0,zmm5,zmm3,0x11 + vpclmulqdq zmm1,zmm5,zmm3,0x00 + vpclmulqdq zmm2,zmm5,zmm3,0x01 + vpclmulqdq zmm5,zmm5,zmm3,0x10 + vpxorq zmm5,zmm5,zmm2 + + vpsrldq zmm2,zmm5,8 + vpslldq zmm5,zmm5,8 + vpxorq zmm0,zmm0,zmm2 + vpxorq zmm5,zmm5,zmm1 + + + + vmovdqu64 zmm2,ZMMWORD[POLY2] + + vpclmulqdq zmm1,zmm2,zmm5,0x01 + vpslldq zmm1,zmm1,8 + vpxorq zmm5,zmm5,zmm1 + + + + vpclmulqdq zmm1,zmm2,zmm5,0x00 + vpsrldq zmm1,zmm1,4 + vpclmulqdq zmm5,zmm2,zmm5,0x10 + vpslldq zmm5,zmm5,4 + + vpternlogq zmm5,zmm0,zmm1,0x96 + + vmovdqu64 ZMMWORD[160+rdx],zmm5 + + vpclmulqdq zmm0,zmm4,zmm3,0x11 + vpclmulqdq zmm1,zmm4,zmm3,0x00 + vpclmulqdq zmm2,zmm4,zmm3,0x01 + vpclmulqdq zmm4,zmm4,zmm3,0x10 + vpxorq zmm4,zmm4,zmm2 + + vpsrldq zmm2,zmm4,8 + vpslldq zmm4,zmm4,8 + vpxorq zmm0,zmm0,zmm2 + vpxorq zmm4,zmm4,zmm1 + + + + vmovdqu64 zmm2,ZMMWORD[POLY2] + + vpclmulqdq zmm1,zmm2,zmm4,0x01 + vpslldq zmm1,zmm1,8 + vpxorq zmm4,zmm4,zmm1 + + + + vpclmulqdq zmm1,zmm2,zmm4,0x00 + vpsrldq zmm1,zmm1,4 + vpclmulqdq zmm4,zmm2,zmm4,0x10 + vpslldq 
zmm4,zmm4,4 + + vpternlogq zmm4,zmm0,zmm1,0x96 + + vmovdqu64 ZMMWORD[96+rdx],zmm4 + vzeroupper +$L$abort_init: + DB 0F3h,0C3h ;repret + + +global ossl_aes_gcm_setiv_avx512 + +ALIGN 32 +ossl_aes_gcm_setiv_avx512: + +$L$setiv_seh_begin: +DB 243,15,30,250 + push rbx + +$L$setiv_seh_push_rbx: + push rbp + +$L$setiv_seh_push_rbp: + push r12 + +$L$setiv_seh_push_r12: + push r13 + +$L$setiv_seh_push_r13: + push r14 + +$L$setiv_seh_push_r14: + push r15 + +$L$setiv_seh_push_r15: + push rdi +$L$setiv_seh_push_rdi: + push rsi +$L$setiv_seh_push_rsi: + + sub rsp,168 +$L$setiv_seh_allocstack_xmm: + + + + + + + + + + + lea rbp,[160+rsp] + +$L$setiv_seh_setfp: + vmovdqu XMMWORD[rsp],xmm6 +$L$setiv_seh_save_xmm6: + vmovdqu XMMWORD[16+rsp],xmm7 +$L$setiv_seh_save_xmm7: + vmovdqu XMMWORD[32+rsp],xmm8 +$L$setiv_seh_save_xmm8: + vmovdqu XMMWORD[48+rsp],xmm9 +$L$setiv_seh_save_xmm9: + vmovdqu XMMWORD[64+rsp],xmm10 +$L$setiv_seh_save_xmm10: + vmovdqu XMMWORD[80+rsp],xmm11 +$L$setiv_seh_save_xmm11: + vmovdqu XMMWORD[96+rsp],xmm12 +$L$setiv_seh_save_xmm12: + vmovdqu XMMWORD[112+rsp],xmm13 +$L$setiv_seh_save_xmm13: + vmovdqu XMMWORD[128+rsp],xmm14 +$L$setiv_seh_save_xmm14: + vmovdqu XMMWORD[144+rsp],xmm15 +$L$setiv_seh_save_xmm15: + +$L$setiv_seh_prolog_end: + sub rsp,816 + and rsp,(-64) + cmp r9,12 + je NEAR iv_len_12_init_IV + vpxor xmm2,xmm2,xmm2 + mov r10,r8 + mov r11,r9 + or r11,r11 + jz NEAR $L$_CALC_AAD_done_1 + + xor rbx,rbx + vmovdqa64 zmm16,ZMMWORD[SHUF_MASK] + +$L$_get_AAD_loop48x16_1: + cmp r11,768 + jl NEAR $L$_exit_AAD_loop48x16_1 + vmovdqu64 zmm11,ZMMWORD[r10] + vmovdqu64 zmm3,ZMMWORD[64+r10] + vmovdqu64 zmm4,ZMMWORD[128+r10] + vmovdqu64 zmm5,ZMMWORD[192+r10] + vpshufb zmm11,zmm11,zmm16 + vpshufb zmm3,zmm3,zmm16 + vpshufb zmm4,zmm4,zmm16 + vpshufb zmm5,zmm5,zmm16 + test rbx,rbx + jnz NEAR $L$_skip_hkeys_precomputation_2 + + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vmovdqu64 ZMMWORD[704+rsp],zmm1 + + vmovdqu64 zmm9,ZMMWORD[224+rdx] + vmovdqu64 ZMMWORD[640+rsp],zmm9 + + + vshufi64x2 zmm9,zmm9,zmm9,0x00 + + vmovdqu64 zmm10,ZMMWORD[160+rdx] + vmovdqu64 ZMMWORD[576+rsp],zmm10 + + vmovdqu64 zmm12,ZMMWORD[96+rdx] + vmovdqu64 ZMMWORD[512+rsp],zmm12 + + vpclmulqdq zmm13,zmm10,zmm9,0x11 + vpclmulqdq zmm15,zmm10,zmm9,0x00 + vpclmulqdq zmm17,zmm10,zmm9,0x01 + vpclmulqdq zmm10,zmm10,zmm9,0x10 + vpxorq zmm10,zmm10,zmm17 + + vpsrldq zmm17,zmm10,8 + vpslldq zmm10,zmm10,8 + vpxorq zmm13,zmm13,zmm17 + vpxorq zmm10,zmm10,zmm15 + + + + vmovdqu64 zmm17,ZMMWORD[POLY2] + + vpclmulqdq zmm15,zmm17,zmm10,0x01 + vpslldq zmm15,zmm15,8 + vpxorq zmm10,zmm10,zmm15 + + + + vpclmulqdq zmm15,zmm17,zmm10,0x00 + vpsrldq zmm15,zmm15,4 + vpclmulqdq zmm10,zmm17,zmm10,0x10 + vpslldq zmm10,zmm10,4 + + vpternlogq zmm10,zmm13,zmm15,0x96 + + vmovdqu64 ZMMWORD[448+rsp],zmm10 + + vpclmulqdq zmm13,zmm12,zmm9,0x11 + vpclmulqdq zmm15,zmm12,zmm9,0x00 + vpclmulqdq zmm17,zmm12,zmm9,0x01 + vpclmulqdq zmm12,zmm12,zmm9,0x10 + vpxorq zmm12,zmm12,zmm17 + + vpsrldq zmm17,zmm12,8 + vpslldq zmm12,zmm12,8 + vpxorq zmm13,zmm13,zmm17 + vpxorq zmm12,zmm12,zmm15 + + + + vmovdqu64 zmm17,ZMMWORD[POLY2] + + vpclmulqdq zmm15,zmm17,zmm12,0x01 + vpslldq zmm15,zmm15,8 + vpxorq zmm12,zmm12,zmm15 + + + + vpclmulqdq zmm15,zmm17,zmm12,0x00 + vpsrldq zmm15,zmm15,4 + vpclmulqdq zmm12,zmm17,zmm12,0x10 + vpslldq zmm12,zmm12,4 + + vpternlogq zmm12,zmm13,zmm15,0x96 + + vmovdqu64 ZMMWORD[384+rsp],zmm12 + + vpclmulqdq zmm13,zmm10,zmm9,0x11 + vpclmulqdq zmm15,zmm10,zmm9,0x00 + vpclmulqdq zmm17,zmm10,zmm9,0x01 + vpclmulqdq zmm10,zmm10,zmm9,0x10 + vpxorq zmm10,zmm10,zmm17 + + vpsrldq 
zmm17,zmm10,8 + vpslldq zmm10,zmm10,8 + vpxorq zmm13,zmm13,zmm17 + vpxorq zmm10,zmm10,zmm15 + + + + vmovdqu64 zmm17,ZMMWORD[POLY2] + + vpclmulqdq zmm15,zmm17,zmm10,0x01 + vpslldq zmm15,zmm15,8 + vpxorq zmm10,zmm10,zmm15 + + + + vpclmulqdq zmm15,zmm17,zmm10,0x00 + vpsrldq zmm15,zmm15,4 + vpclmulqdq zmm10,zmm17,zmm10,0x10 + vpslldq zmm10,zmm10,4 + + vpternlogq zmm10,zmm13,zmm15,0x96 + + vmovdqu64 ZMMWORD[320+rsp],zmm10 + + vpclmulqdq zmm13,zmm12,zmm9,0x11 + vpclmulqdq zmm15,zmm12,zmm9,0x00 + vpclmulqdq zmm17,zmm12,zmm9,0x01 + vpclmulqdq zmm12,zmm12,zmm9,0x10 + vpxorq zmm12,zmm12,zmm17 + + vpsrldq zmm17,zmm12,8 + vpslldq zmm12,zmm12,8 + vpxorq zmm13,zmm13,zmm17 + vpxorq zmm12,zmm12,zmm15 + + + + vmovdqu64 zmm17,ZMMWORD[POLY2] + + vpclmulqdq zmm15,zmm17,zmm12,0x01 + vpslldq zmm15,zmm15,8 + vpxorq zmm12,zmm12,zmm15 + + + + vpclmulqdq zmm15,zmm17,zmm12,0x00 + vpsrldq zmm15,zmm15,4 + vpclmulqdq zmm12,zmm17,zmm12,0x10 + vpslldq zmm12,zmm12,4 + + vpternlogq zmm12,zmm13,zmm15,0x96 + + vmovdqu64 ZMMWORD[256+rsp],zmm12 + + vpclmulqdq zmm13,zmm10,zmm9,0x11 + vpclmulqdq zmm15,zmm10,zmm9,0x00 + vpclmulqdq zmm17,zmm10,zmm9,0x01 + vpclmulqdq zmm10,zmm10,zmm9,0x10 + vpxorq zmm10,zmm10,zmm17 + + vpsrldq zmm17,zmm10,8 + vpslldq zmm10,zmm10,8 + vpxorq zmm13,zmm13,zmm17 + vpxorq zmm10,zmm10,zmm15 + + + + vmovdqu64 zmm17,ZMMWORD[POLY2] + + vpclmulqdq zmm15,zmm17,zmm10,0x01 + vpslldq zmm15,zmm15,8 + vpxorq zmm10,zmm10,zmm15 + + + + vpclmulqdq zmm15,zmm17,zmm10,0x00 + vpsrldq zmm15,zmm15,4 + vpclmulqdq zmm10,zmm17,zmm10,0x10 + vpslldq zmm10,zmm10,4 + + vpternlogq zmm10,zmm13,zmm15,0x96 + + vmovdqu64 ZMMWORD[192+rsp],zmm10 + + vpclmulqdq zmm13,zmm12,zmm9,0x11 + vpclmulqdq zmm15,zmm12,zmm9,0x00 + vpclmulqdq zmm17,zmm12,zmm9,0x01 + vpclmulqdq zmm12,zmm12,zmm9,0x10 + vpxorq zmm12,zmm12,zmm17 + + vpsrldq zmm17,zmm12,8 + vpslldq zmm12,zmm12,8 + vpxorq zmm13,zmm13,zmm17 + vpxorq zmm12,zmm12,zmm15 + + + + vmovdqu64 zmm17,ZMMWORD[POLY2] + + vpclmulqdq zmm15,zmm17,zmm12,0x01 + vpslldq zmm15,zmm15,8 + vpxorq zmm12,zmm12,zmm15 + + + + vpclmulqdq zmm15,zmm17,zmm12,0x00 + vpsrldq zmm15,zmm15,4 + vpclmulqdq zmm12,zmm17,zmm12,0x10 + vpslldq zmm12,zmm12,4 + + vpternlogq zmm12,zmm13,zmm15,0x96 + + vmovdqu64 ZMMWORD[128+rsp],zmm12 + + vpclmulqdq zmm13,zmm10,zmm9,0x11 + vpclmulqdq zmm15,zmm10,zmm9,0x00 + vpclmulqdq zmm17,zmm10,zmm9,0x01 + vpclmulqdq zmm10,zmm10,zmm9,0x10 + vpxorq zmm10,zmm10,zmm17 + + vpsrldq zmm17,zmm10,8 + vpslldq zmm10,zmm10,8 + vpxorq zmm13,zmm13,zmm17 + vpxorq zmm10,zmm10,zmm15 + + + + vmovdqu64 zmm17,ZMMWORD[POLY2] + + vpclmulqdq zmm15,zmm17,zmm10,0x01 + vpslldq zmm15,zmm15,8 + vpxorq zmm10,zmm10,zmm15 + + + + vpclmulqdq zmm15,zmm17,zmm10,0x00 + vpsrldq zmm15,zmm15,4 + vpclmulqdq zmm10,zmm17,zmm10,0x10 + vpslldq zmm10,zmm10,4 + + vpternlogq zmm10,zmm13,zmm15,0x96 + + vmovdqu64 ZMMWORD[64+rsp],zmm10 + + vpclmulqdq zmm13,zmm12,zmm9,0x11 + vpclmulqdq zmm15,zmm12,zmm9,0x00 + vpclmulqdq zmm17,zmm12,zmm9,0x01 + vpclmulqdq zmm12,zmm12,zmm9,0x10 + vpxorq zmm12,zmm12,zmm17 + + vpsrldq zmm17,zmm12,8 + vpslldq zmm12,zmm12,8 + vpxorq zmm13,zmm13,zmm17 + vpxorq zmm12,zmm12,zmm15 + + + + vmovdqu64 zmm17,ZMMWORD[POLY2] + + vpclmulqdq zmm15,zmm17,zmm12,0x01 + vpslldq zmm15,zmm15,8 + vpxorq zmm12,zmm12,zmm15 + + + + vpclmulqdq zmm15,zmm17,zmm12,0x00 + vpsrldq zmm15,zmm15,4 + vpclmulqdq zmm12,zmm17,zmm12,0x10 + vpslldq zmm12,zmm12,4 + + vpternlogq zmm12,zmm13,zmm15,0x96 + + vmovdqu64 ZMMWORD[rsp],zmm12 +$L$_skip_hkeys_precomputation_2: + mov rbx,1 + vpxorq zmm11,zmm11,zmm2 + vmovdqu64 zmm19,ZMMWORD[rsp] + vpclmulqdq 
zmm1,zmm11,zmm19,0x11 + vpclmulqdq zmm9,zmm11,zmm19,0x00 + vpclmulqdq zmm10,zmm11,zmm19,0x01 + vpclmulqdq zmm12,zmm11,zmm19,0x10 + vmovdqu64 zmm19,ZMMWORD[64+rsp] + vpclmulqdq zmm13,zmm3,zmm19,0x11 + vpclmulqdq zmm15,zmm3,zmm19,0x00 + vpclmulqdq zmm17,zmm3,zmm19,0x01 + vpclmulqdq zmm18,zmm3,zmm19,0x10 + vpxorq zmm7,zmm10,zmm17 + vpxorq zmm6,zmm1,zmm13 + vpxorq zmm8,zmm9,zmm15 + vpternlogq zmm7,zmm12,zmm18,0x96 + vmovdqu64 zmm19,ZMMWORD[128+rsp] + vpclmulqdq zmm1,zmm4,zmm19,0x11 + vpclmulqdq zmm9,zmm4,zmm19,0x00 + vpclmulqdq zmm10,zmm4,zmm19,0x01 + vpclmulqdq zmm12,zmm4,zmm19,0x10 + vmovdqu64 zmm19,ZMMWORD[192+rsp] + vpclmulqdq zmm13,zmm5,zmm19,0x11 + vpclmulqdq zmm15,zmm5,zmm19,0x00 + vpclmulqdq zmm17,zmm5,zmm19,0x01 + vpclmulqdq zmm18,zmm5,zmm19,0x10 + + vpternlogq zmm7,zmm10,zmm17,0x96 + vpternlogq zmm6,zmm1,zmm13,0x96 + vpternlogq zmm8,zmm9,zmm15,0x96 + vpternlogq zmm7,zmm12,zmm18,0x96 + vmovdqu64 zmm11,ZMMWORD[256+r10] + vmovdqu64 zmm3,ZMMWORD[320+r10] + vmovdqu64 zmm4,ZMMWORD[384+r10] + vmovdqu64 zmm5,ZMMWORD[448+r10] + vpshufb zmm11,zmm11,zmm16 + vpshufb zmm3,zmm3,zmm16 + vpshufb zmm4,zmm4,zmm16 + vpshufb zmm5,zmm5,zmm16 + vmovdqu64 zmm19,ZMMWORD[256+rsp] + vpclmulqdq zmm1,zmm11,zmm19,0x11 + vpclmulqdq zmm9,zmm11,zmm19,0x00 + vpclmulqdq zmm10,zmm11,zmm19,0x01 + vpclmulqdq zmm12,zmm11,zmm19,0x10 + vmovdqu64 zmm19,ZMMWORD[320+rsp] + vpclmulqdq zmm13,zmm3,zmm19,0x11 + vpclmulqdq zmm15,zmm3,zmm19,0x00 + vpclmulqdq zmm17,zmm3,zmm19,0x01 + vpclmulqdq zmm18,zmm3,zmm19,0x10 + vpternlogq zmm7,zmm10,zmm17,0x96 + vpternlogq zmm6,zmm1,zmm13,0x96 + vpternlogq zmm8,zmm9,zmm15,0x96 + vpternlogq zmm7,zmm12,zmm18,0x96 + vmovdqu64 zmm19,ZMMWORD[384+rsp] + vpclmulqdq zmm1,zmm4,zmm19,0x11 + vpclmulqdq zmm9,zmm4,zmm19,0x00 + vpclmulqdq zmm10,zmm4,zmm19,0x01 + vpclmulqdq zmm12,zmm4,zmm19,0x10 + vmovdqu64 zmm19,ZMMWORD[448+rsp] + vpclmulqdq zmm13,zmm5,zmm19,0x11 + vpclmulqdq zmm15,zmm5,zmm19,0x00 + vpclmulqdq zmm17,zmm5,zmm19,0x01 + vpclmulqdq zmm18,zmm5,zmm19,0x10 + + vpternlogq zmm7,zmm10,zmm17,0x96 + vpternlogq zmm6,zmm1,zmm13,0x96 + vpternlogq zmm8,zmm9,zmm15,0x96 + vpternlogq zmm7,zmm12,zmm18,0x96 + vmovdqu64 zmm11,ZMMWORD[512+r10] + vmovdqu64 zmm3,ZMMWORD[576+r10] + vmovdqu64 zmm4,ZMMWORD[640+r10] + vmovdqu64 zmm5,ZMMWORD[704+r10] + vpshufb zmm11,zmm11,zmm16 + vpshufb zmm3,zmm3,zmm16 + vpshufb zmm4,zmm4,zmm16 + vpshufb zmm5,zmm5,zmm16 + vmovdqu64 zmm19,ZMMWORD[512+rsp] + vpclmulqdq zmm1,zmm11,zmm19,0x11 + vpclmulqdq zmm9,zmm11,zmm19,0x00 + vpclmulqdq zmm10,zmm11,zmm19,0x01 + vpclmulqdq zmm12,zmm11,zmm19,0x10 + vmovdqu64 zmm19,ZMMWORD[576+rsp] + vpclmulqdq zmm13,zmm3,zmm19,0x11 + vpclmulqdq zmm15,zmm3,zmm19,0x00 + vpclmulqdq zmm17,zmm3,zmm19,0x01 + vpclmulqdq zmm18,zmm3,zmm19,0x10 + vpternlogq zmm7,zmm10,zmm17,0x96 + vpternlogq zmm6,zmm1,zmm13,0x96 + vpternlogq zmm8,zmm9,zmm15,0x96 + vpternlogq zmm7,zmm12,zmm18,0x96 + vmovdqu64 zmm19,ZMMWORD[640+rsp] + vpclmulqdq zmm1,zmm4,zmm19,0x11 + vpclmulqdq zmm9,zmm4,zmm19,0x00 + vpclmulqdq zmm10,zmm4,zmm19,0x01 + vpclmulqdq zmm12,zmm4,zmm19,0x10 + vmovdqu64 zmm19,ZMMWORD[704+rsp] + vpclmulqdq zmm13,zmm5,zmm19,0x11 + vpclmulqdq zmm15,zmm5,zmm19,0x00 + vpclmulqdq zmm17,zmm5,zmm19,0x01 + vpclmulqdq zmm18,zmm5,zmm19,0x10 + + vpternlogq zmm7,zmm10,zmm17,0x96 + vpternlogq zmm6,zmm1,zmm13,0x96 + vpternlogq zmm8,zmm9,zmm15,0x96 + vpternlogq zmm7,zmm12,zmm18,0x96 + + vpsrldq zmm1,zmm7,8 + vpslldq zmm9,zmm7,8 + vpxorq zmm6,zmm6,zmm1 + vpxorq zmm8,zmm8,zmm9 + vextracti64x4 ymm1,zmm6,1 + vpxorq ymm6,ymm6,ymm1 + vextracti32x4 xmm1,ymm6,1 + vpxorq xmm6,xmm6,xmm1 + 
vextracti64x4 ymm9,zmm8,1 + vpxorq ymm8,ymm8,ymm9 + vextracti32x4 xmm9,ymm8,1 + vpxorq xmm8,xmm8,xmm9 + vmovdqa64 xmm10,XMMWORD[POLY2] + + + vpclmulqdq xmm1,xmm10,xmm8,0x01 + vpslldq xmm1,xmm1,8 + vpxorq xmm1,xmm8,xmm1 + + + vpclmulqdq xmm9,xmm10,xmm1,0x00 + vpsrldq xmm9,xmm9,4 + vpclmulqdq xmm2,xmm10,xmm1,0x10 + vpslldq xmm2,xmm2,4 + vpternlogq xmm2,xmm9,xmm6,0x96 + + sub r11,768 + je NEAR $L$_CALC_AAD_done_1 + + add r10,768 + jmp NEAR $L$_get_AAD_loop48x16_1 + +$L$_exit_AAD_loop48x16_1: + + cmp r11,512 + jl NEAR $L$_less_than_32x16_1 + + vmovdqu64 zmm11,ZMMWORD[r10] + vmovdqu64 zmm3,ZMMWORD[64+r10] + vmovdqu64 zmm4,ZMMWORD[128+r10] + vmovdqu64 zmm5,ZMMWORD[192+r10] + vpshufb zmm11,zmm11,zmm16 + vpshufb zmm3,zmm3,zmm16 + vpshufb zmm4,zmm4,zmm16 + vpshufb zmm5,zmm5,zmm16 + test rbx,rbx + jnz NEAR $L$_skip_hkeys_precomputation_3 + + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vmovdqu64 ZMMWORD[704+rsp],zmm1 + + vmovdqu64 zmm9,ZMMWORD[224+rdx] + vmovdqu64 ZMMWORD[640+rsp],zmm9 + + + vshufi64x2 zmm9,zmm9,zmm9,0x00 + + vmovdqu64 zmm10,ZMMWORD[160+rdx] + vmovdqu64 ZMMWORD[576+rsp],zmm10 + + vmovdqu64 zmm12,ZMMWORD[96+rdx] + vmovdqu64 ZMMWORD[512+rsp],zmm12 + + vpclmulqdq zmm13,zmm10,zmm9,0x11 + vpclmulqdq zmm15,zmm10,zmm9,0x00 + vpclmulqdq zmm17,zmm10,zmm9,0x01 + vpclmulqdq zmm10,zmm10,zmm9,0x10 + vpxorq zmm10,zmm10,zmm17 + + vpsrldq zmm17,zmm10,8 + vpslldq zmm10,zmm10,8 + vpxorq zmm13,zmm13,zmm17 + vpxorq zmm10,zmm10,zmm15 + + + + vmovdqu64 zmm17,ZMMWORD[POLY2] + + vpclmulqdq zmm15,zmm17,zmm10,0x01 + vpslldq zmm15,zmm15,8 + vpxorq zmm10,zmm10,zmm15 + + + + vpclmulqdq zmm15,zmm17,zmm10,0x00 + vpsrldq zmm15,zmm15,4 + vpclmulqdq zmm10,zmm17,zmm10,0x10 + vpslldq zmm10,zmm10,4 + + vpternlogq zmm10,zmm13,zmm15,0x96 + + vmovdqu64 ZMMWORD[448+rsp],zmm10 + + vpclmulqdq zmm13,zmm12,zmm9,0x11 + vpclmulqdq zmm15,zmm12,zmm9,0x00 + vpclmulqdq zmm17,zmm12,zmm9,0x01 + vpclmulqdq zmm12,zmm12,zmm9,0x10 + vpxorq zmm12,zmm12,zmm17 + + vpsrldq zmm17,zmm12,8 + vpslldq zmm12,zmm12,8 + vpxorq zmm13,zmm13,zmm17 + vpxorq zmm12,zmm12,zmm15 + + + + vmovdqu64 zmm17,ZMMWORD[POLY2] + + vpclmulqdq zmm15,zmm17,zmm12,0x01 + vpslldq zmm15,zmm15,8 + vpxorq zmm12,zmm12,zmm15 + + + + vpclmulqdq zmm15,zmm17,zmm12,0x00 + vpsrldq zmm15,zmm15,4 + vpclmulqdq zmm12,zmm17,zmm12,0x10 + vpslldq zmm12,zmm12,4 + + vpternlogq zmm12,zmm13,zmm15,0x96 + + vmovdqu64 ZMMWORD[384+rsp],zmm12 + + vpclmulqdq zmm13,zmm10,zmm9,0x11 + vpclmulqdq zmm15,zmm10,zmm9,0x00 + vpclmulqdq zmm17,zmm10,zmm9,0x01 + vpclmulqdq zmm10,zmm10,zmm9,0x10 + vpxorq zmm10,zmm10,zmm17 + + vpsrldq zmm17,zmm10,8 + vpslldq zmm10,zmm10,8 + vpxorq zmm13,zmm13,zmm17 + vpxorq zmm10,zmm10,zmm15 + + + + vmovdqu64 zmm17,ZMMWORD[POLY2] + + vpclmulqdq zmm15,zmm17,zmm10,0x01 + vpslldq zmm15,zmm15,8 + vpxorq zmm10,zmm10,zmm15 + + + + vpclmulqdq zmm15,zmm17,zmm10,0x00 + vpsrldq zmm15,zmm15,4 + vpclmulqdq zmm10,zmm17,zmm10,0x10 + vpslldq zmm10,zmm10,4 + + vpternlogq zmm10,zmm13,zmm15,0x96 + + vmovdqu64 ZMMWORD[320+rsp],zmm10 + + vpclmulqdq zmm13,zmm12,zmm9,0x11 + vpclmulqdq zmm15,zmm12,zmm9,0x00 + vpclmulqdq zmm17,zmm12,zmm9,0x01 + vpclmulqdq zmm12,zmm12,zmm9,0x10 + vpxorq zmm12,zmm12,zmm17 + + vpsrldq zmm17,zmm12,8 + vpslldq zmm12,zmm12,8 + vpxorq zmm13,zmm13,zmm17 + vpxorq zmm12,zmm12,zmm15 + + + + vmovdqu64 zmm17,ZMMWORD[POLY2] + + vpclmulqdq zmm15,zmm17,zmm12,0x01 + vpslldq zmm15,zmm15,8 + vpxorq zmm12,zmm12,zmm15 + + + + vpclmulqdq zmm15,zmm17,zmm12,0x00 + vpsrldq zmm15,zmm15,4 + vpclmulqdq zmm12,zmm17,zmm12,0x10 + vpslldq zmm12,zmm12,4 + + vpternlogq zmm12,zmm13,zmm15,0x96 + + vmovdqu64 
ZMMWORD[256+rsp],zmm12 +$L$_skip_hkeys_precomputation_3: + mov rbx,1 + vpxorq zmm11,zmm11,zmm2 + vmovdqu64 zmm19,ZMMWORD[256+rsp] + vpclmulqdq zmm1,zmm11,zmm19,0x11 + vpclmulqdq zmm9,zmm11,zmm19,0x00 + vpclmulqdq zmm10,zmm11,zmm19,0x01 + vpclmulqdq zmm12,zmm11,zmm19,0x10 + vmovdqu64 zmm19,ZMMWORD[320+rsp] + vpclmulqdq zmm13,zmm3,zmm19,0x11 + vpclmulqdq zmm15,zmm3,zmm19,0x00 + vpclmulqdq zmm17,zmm3,zmm19,0x01 + vpclmulqdq zmm18,zmm3,zmm19,0x10 + vpxorq zmm7,zmm10,zmm17 + vpxorq zmm6,zmm1,zmm13 + vpxorq zmm8,zmm9,zmm15 + vpternlogq zmm7,zmm12,zmm18,0x96 + vmovdqu64 zmm19,ZMMWORD[384+rsp] + vpclmulqdq zmm1,zmm4,zmm19,0x11 + vpclmulqdq zmm9,zmm4,zmm19,0x00 + vpclmulqdq zmm10,zmm4,zmm19,0x01 + vpclmulqdq zmm12,zmm4,zmm19,0x10 + vmovdqu64 zmm19,ZMMWORD[448+rsp] + vpclmulqdq zmm13,zmm5,zmm19,0x11 + vpclmulqdq zmm15,zmm5,zmm19,0x00 + vpclmulqdq zmm17,zmm5,zmm19,0x01 + vpclmulqdq zmm18,zmm5,zmm19,0x10 + + vpternlogq zmm7,zmm10,zmm17,0x96 + vpternlogq zmm6,zmm1,zmm13,0x96 + vpternlogq zmm8,zmm9,zmm15,0x96 + vpternlogq zmm7,zmm12,zmm18,0x96 + vmovdqu64 zmm11,ZMMWORD[256+r10] + vmovdqu64 zmm3,ZMMWORD[320+r10] + vmovdqu64 zmm4,ZMMWORD[384+r10] + vmovdqu64 zmm5,ZMMWORD[448+r10] + vpshufb zmm11,zmm11,zmm16 + vpshufb zmm3,zmm3,zmm16 + vpshufb zmm4,zmm4,zmm16 + vpshufb zmm5,zmm5,zmm16 + vmovdqu64 zmm19,ZMMWORD[512+rsp] + vpclmulqdq zmm1,zmm11,zmm19,0x11 + vpclmulqdq zmm9,zmm11,zmm19,0x00 + vpclmulqdq zmm10,zmm11,zmm19,0x01 + vpclmulqdq zmm12,zmm11,zmm19,0x10 + vmovdqu64 zmm19,ZMMWORD[576+rsp] + vpclmulqdq zmm13,zmm3,zmm19,0x11 + vpclmulqdq zmm15,zmm3,zmm19,0x00 + vpclmulqdq zmm17,zmm3,zmm19,0x01 + vpclmulqdq zmm18,zmm3,zmm19,0x10 + vpternlogq zmm7,zmm10,zmm17,0x96 + vpternlogq zmm6,zmm1,zmm13,0x96 + vpternlogq zmm8,zmm9,zmm15,0x96 + vpternlogq zmm7,zmm12,zmm18,0x96 + vmovdqu64 zmm19,ZMMWORD[640+rsp] + vpclmulqdq zmm1,zmm4,zmm19,0x11 + vpclmulqdq zmm9,zmm4,zmm19,0x00 + vpclmulqdq zmm10,zmm4,zmm19,0x01 + vpclmulqdq zmm12,zmm4,zmm19,0x10 + vmovdqu64 zmm19,ZMMWORD[704+rsp] + vpclmulqdq zmm13,zmm5,zmm19,0x11 + vpclmulqdq zmm15,zmm5,zmm19,0x00 + vpclmulqdq zmm17,zmm5,zmm19,0x01 + vpclmulqdq zmm18,zmm5,zmm19,0x10 + + vpternlogq zmm7,zmm10,zmm17,0x96 + vpternlogq zmm6,zmm1,zmm13,0x96 + vpternlogq zmm8,zmm9,zmm15,0x96 + vpternlogq zmm7,zmm12,zmm18,0x96 + + vpsrldq zmm1,zmm7,8 + vpslldq zmm9,zmm7,8 + vpxorq zmm6,zmm6,zmm1 + vpxorq zmm8,zmm8,zmm9 + vextracti64x4 ymm1,zmm6,1 + vpxorq ymm6,ymm6,ymm1 + vextracti32x4 xmm1,ymm6,1 + vpxorq xmm6,xmm6,xmm1 + vextracti64x4 ymm9,zmm8,1 + vpxorq ymm8,ymm8,ymm9 + vextracti32x4 xmm9,ymm8,1 + vpxorq xmm8,xmm8,xmm9 + vmovdqa64 xmm10,XMMWORD[POLY2] + + + vpclmulqdq xmm1,xmm10,xmm8,0x01 + vpslldq xmm1,xmm1,8 + vpxorq xmm1,xmm8,xmm1 + + + vpclmulqdq xmm9,xmm10,xmm1,0x00 + vpsrldq xmm9,xmm9,4 + vpclmulqdq xmm2,xmm10,xmm1,0x10 + vpslldq xmm2,xmm2,4 + vpternlogq xmm2,xmm9,xmm6,0x96 + + sub r11,512 + je NEAR $L$_CALC_AAD_done_1 + + add r10,512 + jmp NEAR $L$_less_than_16x16_1 + +$L$_less_than_32x16_1: + cmp r11,256 + jl NEAR $L$_less_than_16x16_1 + + vmovdqu64 zmm11,ZMMWORD[r10] + vmovdqu64 zmm3,ZMMWORD[64+r10] + vmovdqu64 zmm4,ZMMWORD[128+r10] + vmovdqu64 zmm5,ZMMWORD[192+r10] + vpshufb zmm11,zmm11,zmm16 + vpshufb zmm3,zmm3,zmm16 + vpshufb zmm4,zmm4,zmm16 + vpshufb zmm5,zmm5,zmm16 + vpxorq zmm11,zmm11,zmm2 + vmovdqu64 zmm19,ZMMWORD[96+rdx] + vpclmulqdq zmm1,zmm11,zmm19,0x11 + vpclmulqdq zmm9,zmm11,zmm19,0x00 + vpclmulqdq zmm10,zmm11,zmm19,0x01 + vpclmulqdq zmm12,zmm11,zmm19,0x10 + vmovdqu64 zmm19,ZMMWORD[160+rdx] + vpclmulqdq zmm13,zmm3,zmm19,0x11 + vpclmulqdq zmm15,zmm3,zmm19,0x00 + 
vpclmulqdq zmm17,zmm3,zmm19,0x01 + vpclmulqdq zmm18,zmm3,zmm19,0x10 + vpxorq zmm7,zmm10,zmm17 + vpxorq zmm6,zmm1,zmm13 + vpxorq zmm8,zmm9,zmm15 + vpternlogq zmm7,zmm12,zmm18,0x96 + vmovdqu64 zmm19,ZMMWORD[224+rdx] + vpclmulqdq zmm1,zmm4,zmm19,0x11 + vpclmulqdq zmm9,zmm4,zmm19,0x00 + vpclmulqdq zmm10,zmm4,zmm19,0x01 + vpclmulqdq zmm12,zmm4,zmm19,0x10 + vmovdqu64 zmm19,ZMMWORD[288+rdx] + vpclmulqdq zmm13,zmm5,zmm19,0x11 + vpclmulqdq zmm15,zmm5,zmm19,0x00 + vpclmulqdq zmm17,zmm5,zmm19,0x01 + vpclmulqdq zmm18,zmm5,zmm19,0x10 + + vpternlogq zmm7,zmm10,zmm17,0x96 + vpternlogq zmm6,zmm1,zmm13,0x96 + vpternlogq zmm8,zmm9,zmm15,0x96 + vpternlogq zmm7,zmm12,zmm18,0x96 + + vpsrldq zmm1,zmm7,8 + vpslldq zmm9,zmm7,8 + vpxorq zmm6,zmm6,zmm1 + vpxorq zmm8,zmm8,zmm9 + vextracti64x4 ymm1,zmm6,1 + vpxorq ymm6,ymm6,ymm1 + vextracti32x4 xmm1,ymm6,1 + vpxorq xmm6,xmm6,xmm1 + vextracti64x4 ymm9,zmm8,1 + vpxorq ymm8,ymm8,ymm9 + vextracti32x4 xmm9,ymm8,1 + vpxorq xmm8,xmm8,xmm9 + vmovdqa64 xmm10,XMMWORD[POLY2] + + + vpclmulqdq xmm1,xmm10,xmm8,0x01 + vpslldq xmm1,xmm1,8 + vpxorq xmm1,xmm8,xmm1 + + + vpclmulqdq xmm9,xmm10,xmm1,0x00 + vpsrldq xmm9,xmm9,4 + vpclmulqdq xmm2,xmm10,xmm1,0x10 + vpslldq xmm2,xmm2,4 + vpternlogq xmm2,xmm9,xmm6,0x96 + + sub r11,256 + je NEAR $L$_CALC_AAD_done_1 + + add r10,256 + +$L$_less_than_16x16_1: + + lea r12,[byte64_len_to_mask_table] + lea r12,[r11*8+r12] + + + add r11d,15 + shr r11d,4 + cmp r11d,2 + jb NEAR $L$_AAD_blocks_1_1 + je NEAR $L$_AAD_blocks_2_1 + cmp r11d,4 + jb NEAR $L$_AAD_blocks_3_1 + je NEAR $L$_AAD_blocks_4_1 + cmp r11d,6 + jb NEAR $L$_AAD_blocks_5_1 + je NEAR $L$_AAD_blocks_6_1 + cmp r11d,8 + jb NEAR $L$_AAD_blocks_7_1 + je NEAR $L$_AAD_blocks_8_1 + cmp r11d,10 + jb NEAR $L$_AAD_blocks_9_1 + je NEAR $L$_AAD_blocks_10_1 + cmp r11d,12 + jb NEAR $L$_AAD_blocks_11_1 + je NEAR $L$_AAD_blocks_12_1 + cmp r11d,14 + jb NEAR $L$_AAD_blocks_13_1 + je NEAR $L$_AAD_blocks_14_1 + cmp r11d,15 + je NEAR $L$_AAD_blocks_15_1 +$L$_AAD_blocks_16_1: + sub r12,1536 + kmovq k1,[r12] + vmovdqu8 zmm11,ZMMWORD[r10] + vmovdqu8 zmm3,ZMMWORD[64+r10] + vmovdqu8 zmm4,ZMMWORD[128+r10] + vmovdqu8 zmm5{k1}{z},[192+r10] + vpshufb zmm11,zmm11,zmm16 + vpshufb zmm3,zmm3,zmm16 + vpshufb zmm4,zmm4,zmm16 + vpshufb zmm5,zmm5,zmm16 + vpxorq zmm11,zmm11,zmm2 + vmovdqu64 zmm15,ZMMWORD[96+rdx] + vpclmulqdq zmm1,zmm11,zmm15,0x11 + vpclmulqdq zmm6,zmm11,zmm15,0x00 + vpclmulqdq zmm7,zmm11,zmm15,0x01 + vpclmulqdq zmm8,zmm11,zmm15,0x10 + vmovdqu64 zmm15,ZMMWORD[160+rdx] + vpclmulqdq zmm9,zmm3,zmm15,0x11 + vpclmulqdq zmm10,zmm3,zmm15,0x00 + vpclmulqdq zmm12,zmm3,zmm15,0x01 + vpclmulqdq zmm13,zmm3,zmm15,0x10 + vmovdqu64 zmm15,ZMMWORD[224+rdx] + vpclmulqdq zmm11,zmm4,zmm15,0x11 + vpclmulqdq zmm3,zmm4,zmm15,0x00 + vpternlogq zmm1,zmm11,zmm9,0x96 + vpternlogq zmm6,zmm3,zmm10,0x96 + vpclmulqdq zmm11,zmm4,zmm15,0x01 + vpclmulqdq zmm3,zmm4,zmm15,0x10 + vpternlogq zmm7,zmm11,zmm12,0x96 + vpternlogq zmm8,zmm3,zmm13,0x96 + vmovdqu64 zmm15,ZMMWORD[288+rdx] + vpclmulqdq zmm9,zmm5,zmm15,0x11 + vpclmulqdq zmm10,zmm5,zmm15,0x00 + vpclmulqdq zmm12,zmm5,zmm15,0x01 + vpclmulqdq zmm13,zmm5,zmm15,0x10 + vpxorq zmm9,zmm1,zmm9 + vpxorq zmm10,zmm6,zmm10 + vpxorq zmm12,zmm7,zmm12 + vpxorq zmm13,zmm8,zmm13 + + vpxorq zmm12,zmm12,zmm13 + vpsrldq zmm7,zmm12,8 + vpslldq zmm8,zmm12,8 + vpxorq zmm1,zmm9,zmm7 + vpxorq zmm6,zmm10,zmm8 + vextracti64x4 ymm12,zmm1,1 + vpxorq ymm1,ymm1,ymm12 + vextracti32x4 xmm12,ymm1,1 + vpxorq xmm1,xmm1,xmm12 + vextracti64x4 ymm13,zmm6,1 + vpxorq ymm6,ymm6,ymm13 + vextracti32x4 xmm13,ymm6,1 + vpxorq xmm6,xmm6,xmm13 + 
vmovdqa64 xmm15,XMMWORD[POLY2] + + + vpclmulqdq xmm7,xmm15,xmm6,0x01 + vpslldq xmm7,xmm7,8 + vpxorq xmm7,xmm6,xmm7 + + + vpclmulqdq xmm8,xmm15,xmm7,0x00 + vpsrldq xmm8,xmm8,4 + vpclmulqdq xmm2,xmm15,xmm7,0x10 + vpslldq xmm2,xmm2,4 + vpternlogq xmm2,xmm8,xmm1,0x96 + + jmp NEAR $L$_CALC_AAD_done_1 +$L$_AAD_blocks_15_1: + sub r12,1536 + kmovq k1,[r12] + vmovdqu8 zmm11,ZMMWORD[r10] + vmovdqu8 zmm3,ZMMWORD[64+r10] + vmovdqu8 zmm4,ZMMWORD[128+r10] + vmovdqu8 zmm5{k1}{z},[192+r10] + vpshufb zmm11,zmm11,zmm16 + vpshufb zmm3,zmm3,zmm16 + vpshufb zmm4,zmm4,zmm16 + vpshufb zmm5,zmm5,zmm16 + vpxorq zmm11,zmm11,zmm2 + vmovdqu64 zmm15,ZMMWORD[112+rdx] + vpclmulqdq zmm1,zmm11,zmm15,0x11 + vpclmulqdq zmm6,zmm11,zmm15,0x00 + vpclmulqdq zmm7,zmm11,zmm15,0x01 + vpclmulqdq zmm8,zmm11,zmm15,0x10 + vmovdqu64 zmm15,ZMMWORD[176+rdx] + vpclmulqdq zmm9,zmm3,zmm15,0x11 + vpclmulqdq zmm10,zmm3,zmm15,0x00 + vpclmulqdq zmm12,zmm3,zmm15,0x01 + vpclmulqdq zmm13,zmm3,zmm15,0x10 + vmovdqu64 zmm15,ZMMWORD[240+rdx] + vpclmulqdq zmm11,zmm4,zmm15,0x11 + vpclmulqdq zmm3,zmm4,zmm15,0x00 + vpternlogq zmm9,zmm11,zmm1,0x96 + vpternlogq zmm10,zmm3,zmm6,0x96 + vpclmulqdq zmm11,zmm4,zmm15,0x01 + vpclmulqdq zmm3,zmm4,zmm15,0x10 + vpternlogq zmm12,zmm11,zmm7,0x96 + vpternlogq zmm13,zmm3,zmm8,0x96 + vmovdqu64 ymm15,YMMWORD[304+rdx] + vinserti64x2 zmm15,zmm15,ZMMWORD[336+rdx],2 + vpclmulqdq zmm7,zmm5,zmm15,0x01 + vpclmulqdq zmm8,zmm5,zmm15,0x10 + vpclmulqdq zmm1,zmm5,zmm15,0x11 + vpclmulqdq zmm6,zmm5,zmm15,0x00 + + vpxorq zmm7,zmm7,zmm12 + vpxorq zmm8,zmm8,zmm13 + vpxorq zmm1,zmm1,zmm9 + vpxorq zmm6,zmm6,zmm10 + + vpxorq zmm7,zmm7,zmm8 + vpsrldq zmm12,zmm7,8 + vpslldq zmm13,zmm7,8 + vpxorq zmm1,zmm1,zmm12 + vpxorq zmm6,zmm6,zmm13 + vextracti64x4 ymm12,zmm1,1 + vpxorq ymm1,ymm1,ymm12 + vextracti32x4 xmm12,ymm1,1 + vpxorq xmm1,xmm1,xmm12 + vextracti64x4 ymm13,zmm6,1 + vpxorq ymm6,ymm6,ymm13 + vextracti32x4 xmm13,ymm6,1 + vpxorq xmm6,xmm6,xmm13 + vmovdqa64 xmm15,XMMWORD[POLY2] + + + vpclmulqdq xmm7,xmm15,xmm6,0x01 + vpslldq xmm7,xmm7,8 + vpxorq xmm7,xmm6,xmm7 + + + vpclmulqdq xmm8,xmm15,xmm7,0x00 + vpsrldq xmm8,xmm8,4 + vpclmulqdq xmm2,xmm15,xmm7,0x10 + vpslldq xmm2,xmm2,4 + vpternlogq xmm2,xmm8,xmm1,0x96 + + jmp NEAR $L$_CALC_AAD_done_1 +$L$_AAD_blocks_14_1: + sub r12,1536 + kmovq k1,[r12] + vmovdqu8 zmm11,ZMMWORD[r10] + vmovdqu8 zmm3,ZMMWORD[64+r10] + vmovdqu8 zmm4,ZMMWORD[128+r10] + vmovdqu8 ymm5{k1}{z},[192+r10] + vpshufb zmm11,zmm11,zmm16 + vpshufb zmm3,zmm3,zmm16 + vpshufb zmm4,zmm4,zmm16 + vpshufb ymm5,ymm5,ymm16 + vpxorq zmm11,zmm11,zmm2 + vmovdqu64 zmm15,ZMMWORD[128+rdx] + vpclmulqdq zmm1,zmm11,zmm15,0x11 + vpclmulqdq zmm6,zmm11,zmm15,0x00 + vpclmulqdq zmm7,zmm11,zmm15,0x01 + vpclmulqdq zmm8,zmm11,zmm15,0x10 + vmovdqu64 zmm15,ZMMWORD[192+rdx] + vpclmulqdq zmm9,zmm3,zmm15,0x11 + vpclmulqdq zmm10,zmm3,zmm15,0x00 + vpclmulqdq zmm12,zmm3,zmm15,0x01 + vpclmulqdq zmm13,zmm3,zmm15,0x10 + vmovdqu64 zmm15,ZMMWORD[256+rdx] + vpclmulqdq zmm11,zmm4,zmm15,0x11 + vpclmulqdq zmm3,zmm4,zmm15,0x00 + vpternlogq zmm9,zmm11,zmm1,0x96 + vpternlogq zmm10,zmm3,zmm6,0x96 + vpclmulqdq zmm11,zmm4,zmm15,0x01 + vpclmulqdq zmm3,zmm4,zmm15,0x10 + vpternlogq zmm12,zmm11,zmm7,0x96 + vpternlogq zmm13,zmm3,zmm8,0x96 + vmovdqu64 ymm15,YMMWORD[320+rdx] + vpclmulqdq ymm7,ymm5,ymm15,0x01 + vpclmulqdq ymm8,ymm5,ymm15,0x10 + vpclmulqdq ymm1,ymm5,ymm15,0x11 + vpclmulqdq ymm6,ymm5,ymm15,0x00 + + vpxorq zmm7,zmm7,zmm12 + vpxorq zmm8,zmm8,zmm13 + vpxorq zmm1,zmm1,zmm9 + vpxorq zmm6,zmm6,zmm10 + + vpxorq zmm7,zmm7,zmm8 + vpsrldq zmm12,zmm7,8 + vpslldq zmm13,zmm7,8 + vpxorq 
zmm1,zmm1,zmm12 + vpxorq zmm6,zmm6,zmm13 + vextracti64x4 ymm12,zmm1,1 + vpxorq ymm1,ymm1,ymm12 + vextracti32x4 xmm12,ymm1,1 + vpxorq xmm1,xmm1,xmm12 + vextracti64x4 ymm13,zmm6,1 + vpxorq ymm6,ymm6,ymm13 + vextracti32x4 xmm13,ymm6,1 + vpxorq xmm6,xmm6,xmm13 + vmovdqa64 xmm15,XMMWORD[POLY2] + + + vpclmulqdq xmm7,xmm15,xmm6,0x01 + vpslldq xmm7,xmm7,8 + vpxorq xmm7,xmm6,xmm7 + + + vpclmulqdq xmm8,xmm15,xmm7,0x00 + vpsrldq xmm8,xmm8,4 + vpclmulqdq xmm2,xmm15,xmm7,0x10 + vpslldq xmm2,xmm2,4 + vpternlogq xmm2,xmm8,xmm1,0x96 + + jmp NEAR $L$_CALC_AAD_done_1 +$L$_AAD_blocks_13_1: + sub r12,1536 + kmovq k1,[r12] + vmovdqu8 zmm11,ZMMWORD[r10] + vmovdqu8 zmm3,ZMMWORD[64+r10] + vmovdqu8 zmm4,ZMMWORD[128+r10] + vmovdqu8 xmm5{k1}{z},[192+r10] + vpshufb zmm11,zmm11,zmm16 + vpshufb zmm3,zmm3,zmm16 + vpshufb zmm4,zmm4,zmm16 + vpshufb xmm5,xmm5,xmm16 + vpxorq zmm11,zmm11,zmm2 + vmovdqu64 zmm15,ZMMWORD[144+rdx] + vpclmulqdq zmm1,zmm11,zmm15,0x11 + vpclmulqdq zmm6,zmm11,zmm15,0x00 + vpclmulqdq zmm7,zmm11,zmm15,0x01 + vpclmulqdq zmm8,zmm11,zmm15,0x10 + vmovdqu64 zmm15,ZMMWORD[208+rdx] + vpclmulqdq zmm9,zmm3,zmm15,0x11 + vpclmulqdq zmm10,zmm3,zmm15,0x00 + vpclmulqdq zmm12,zmm3,zmm15,0x01 + vpclmulqdq zmm13,zmm3,zmm15,0x10 + vmovdqu64 zmm15,ZMMWORD[272+rdx] + vpclmulqdq zmm11,zmm4,zmm15,0x11 + vpclmulqdq zmm3,zmm4,zmm15,0x00 + vpternlogq zmm9,zmm11,zmm1,0x96 + vpternlogq zmm10,zmm3,zmm6,0x96 + vpclmulqdq zmm11,zmm4,zmm15,0x01 + vpclmulqdq zmm3,zmm4,zmm15,0x10 + vpternlogq zmm12,zmm11,zmm7,0x96 + vpternlogq zmm13,zmm3,zmm8,0x96 + vmovdqu64 xmm15,XMMWORD[336+rdx] + vpclmulqdq xmm7,xmm5,xmm15,0x01 + vpclmulqdq xmm8,xmm5,xmm15,0x10 + vpclmulqdq xmm1,xmm5,xmm15,0x11 + vpclmulqdq xmm6,xmm5,xmm15,0x00 + + vpxorq zmm7,zmm7,zmm12 + vpxorq zmm8,zmm8,zmm13 + vpxorq zmm1,zmm1,zmm9 + vpxorq zmm6,zmm6,zmm10 + + vpxorq zmm7,zmm7,zmm8 + vpsrldq zmm12,zmm7,8 + vpslldq zmm13,zmm7,8 + vpxorq zmm1,zmm1,zmm12 + vpxorq zmm6,zmm6,zmm13 + vextracti64x4 ymm12,zmm1,1 + vpxorq ymm1,ymm1,ymm12 + vextracti32x4 xmm12,ymm1,1 + vpxorq xmm1,xmm1,xmm12 + vextracti64x4 ymm13,zmm6,1 + vpxorq ymm6,ymm6,ymm13 + vextracti32x4 xmm13,ymm6,1 + vpxorq xmm6,xmm6,xmm13 + vmovdqa64 xmm15,XMMWORD[POLY2] + + + vpclmulqdq xmm7,xmm15,xmm6,0x01 + vpslldq xmm7,xmm7,8 + vpxorq xmm7,xmm6,xmm7 + + + vpclmulqdq xmm8,xmm15,xmm7,0x00 + vpsrldq xmm8,xmm8,4 + vpclmulqdq xmm2,xmm15,xmm7,0x10 + vpslldq xmm2,xmm2,4 + vpternlogq xmm2,xmm8,xmm1,0x96 + + jmp NEAR $L$_CALC_AAD_done_1 +$L$_AAD_blocks_12_1: + sub r12,1024 + kmovq k1,[r12] + vmovdqu8 zmm11,ZMMWORD[r10] + vmovdqu8 zmm3,ZMMWORD[64+r10] + vmovdqu8 zmm4{k1}{z},[128+r10] + vpshufb zmm11,zmm11,zmm16 + vpshufb zmm3,zmm3,zmm16 + vpshufb zmm4,zmm4,zmm16 + vpxorq zmm11,zmm11,zmm2 + vmovdqu64 zmm15,ZMMWORD[160+rdx] + vpclmulqdq zmm1,zmm11,zmm15,0x11 + vpclmulqdq zmm6,zmm11,zmm15,0x00 + vpclmulqdq zmm7,zmm11,zmm15,0x01 + vpclmulqdq zmm8,zmm11,zmm15,0x10 + vmovdqu64 zmm15,ZMMWORD[224+rdx] + vpclmulqdq zmm9,zmm3,zmm15,0x11 + vpclmulqdq zmm10,zmm3,zmm15,0x00 + vpclmulqdq zmm12,zmm3,zmm15,0x01 + vpclmulqdq zmm13,zmm3,zmm15,0x10 + vmovdqu64 zmm15,ZMMWORD[288+rdx] + vpclmulqdq zmm11,zmm4,zmm15,0x11 + vpclmulqdq zmm3,zmm4,zmm15,0x00 + vpternlogq zmm9,zmm11,zmm1,0x96 + vpternlogq zmm10,zmm3,zmm6,0x96 + vpclmulqdq zmm11,zmm4,zmm15,0x01 + vpclmulqdq zmm3,zmm4,zmm15,0x10 + vpternlogq zmm12,zmm11,zmm7,0x96 + vpternlogq zmm13,zmm3,zmm8,0x96 + + vpxorq zmm12,zmm12,zmm13 + vpsrldq zmm7,zmm12,8 + vpslldq zmm8,zmm12,8 + vpxorq zmm1,zmm9,zmm7 + vpxorq zmm6,zmm10,zmm8 + vextracti64x4 ymm12,zmm1,1 + vpxorq ymm1,ymm1,ymm12 + vextracti32x4 
xmm12,ymm1,1 + vpxorq xmm1,xmm1,xmm12 + vextracti64x4 ymm13,zmm6,1 + vpxorq ymm6,ymm6,ymm13 + vextracti32x4 xmm13,ymm6,1 + vpxorq xmm6,xmm6,xmm13 + vmovdqa64 xmm15,XMMWORD[POLY2] + + + vpclmulqdq xmm7,xmm15,xmm6,0x01 + vpslldq xmm7,xmm7,8 + vpxorq xmm7,xmm6,xmm7 + + + vpclmulqdq xmm8,xmm15,xmm7,0x00 + vpsrldq xmm8,xmm8,4 + vpclmulqdq xmm2,xmm15,xmm7,0x10 + vpslldq xmm2,xmm2,4 + vpternlogq xmm2,xmm8,xmm1,0x96 + + jmp NEAR $L$_CALC_AAD_done_1 +$L$_AAD_blocks_11_1: + sub r12,1024 + kmovq k1,[r12] + vmovdqu8 zmm11,ZMMWORD[r10] + vmovdqu8 zmm3,ZMMWORD[64+r10] + vmovdqu8 zmm4{k1}{z},[128+r10] + vpshufb zmm11,zmm11,zmm16 + vpshufb zmm3,zmm3,zmm16 + vpshufb zmm4,zmm4,zmm16 + vpxorq zmm11,zmm11,zmm2 + vmovdqu64 zmm15,ZMMWORD[176+rdx] + vpclmulqdq zmm1,zmm11,zmm15,0x11 + vpclmulqdq zmm6,zmm11,zmm15,0x00 + vpclmulqdq zmm7,zmm11,zmm15,0x01 + vpclmulqdq zmm8,zmm11,zmm15,0x10 + vmovdqu64 zmm15,ZMMWORD[240+rdx] + vpclmulqdq zmm9,zmm3,zmm15,0x11 + vpclmulqdq zmm10,zmm3,zmm15,0x00 + vpclmulqdq zmm12,zmm3,zmm15,0x01 + vpclmulqdq zmm13,zmm3,zmm15,0x10 + vpxorq zmm9,zmm1,zmm9 + vpxorq zmm10,zmm6,zmm10 + vpxorq zmm12,zmm7,zmm12 + vpxorq zmm13,zmm8,zmm13 + vmovdqu64 ymm15,YMMWORD[304+rdx] + vinserti64x2 zmm15,zmm15,ZMMWORD[336+rdx],2 + vpclmulqdq zmm7,zmm4,zmm15,0x01 + vpclmulqdq zmm8,zmm4,zmm15,0x10 + vpclmulqdq zmm1,zmm4,zmm15,0x11 + vpclmulqdq zmm6,zmm4,zmm15,0x00 + + vpxorq zmm7,zmm7,zmm12 + vpxorq zmm8,zmm8,zmm13 + vpxorq zmm1,zmm1,zmm9 + vpxorq zmm6,zmm6,zmm10 + + vpxorq zmm7,zmm7,zmm8 + vpsrldq zmm12,zmm7,8 + vpslldq zmm13,zmm7,8 + vpxorq zmm1,zmm1,zmm12 + vpxorq zmm6,zmm6,zmm13 + vextracti64x4 ymm12,zmm1,1 + vpxorq ymm1,ymm1,ymm12 + vextracti32x4 xmm12,ymm1,1 + vpxorq xmm1,xmm1,xmm12 + vextracti64x4 ymm13,zmm6,1 + vpxorq ymm6,ymm6,ymm13 + vextracti32x4 xmm13,ymm6,1 + vpxorq xmm6,xmm6,xmm13 + vmovdqa64 xmm15,XMMWORD[POLY2] + + + vpclmulqdq xmm7,xmm15,xmm6,0x01 + vpslldq xmm7,xmm7,8 + vpxorq xmm7,xmm6,xmm7 + + + vpclmulqdq xmm8,xmm15,xmm7,0x00 + vpsrldq xmm8,xmm8,4 + vpclmulqdq xmm2,xmm15,xmm7,0x10 + vpslldq xmm2,xmm2,4 + vpternlogq xmm2,xmm8,xmm1,0x96 + + jmp NEAR $L$_CALC_AAD_done_1 +$L$_AAD_blocks_10_1: + sub r12,1024 + kmovq k1,[r12] + vmovdqu8 zmm11,ZMMWORD[r10] + vmovdqu8 zmm3,ZMMWORD[64+r10] + vmovdqu8 ymm4{k1}{z},[128+r10] + vpshufb zmm11,zmm11,zmm16 + vpshufb zmm3,zmm3,zmm16 + vpshufb ymm4,ymm4,ymm16 + vpxorq zmm11,zmm11,zmm2 + vmovdqu64 zmm15,ZMMWORD[192+rdx] + vpclmulqdq zmm1,zmm11,zmm15,0x11 + vpclmulqdq zmm6,zmm11,zmm15,0x00 + vpclmulqdq zmm7,zmm11,zmm15,0x01 + vpclmulqdq zmm8,zmm11,zmm15,0x10 + vmovdqu64 zmm15,ZMMWORD[256+rdx] + vpclmulqdq zmm9,zmm3,zmm15,0x11 + vpclmulqdq zmm10,zmm3,zmm15,0x00 + vpclmulqdq zmm12,zmm3,zmm15,0x01 + vpclmulqdq zmm13,zmm3,zmm15,0x10 + vpxorq zmm9,zmm1,zmm9 + vpxorq zmm10,zmm6,zmm10 + vpxorq zmm12,zmm7,zmm12 + vpxorq zmm13,zmm8,zmm13 + vmovdqu64 ymm15,YMMWORD[320+rdx] + vpclmulqdq ymm7,ymm4,ymm15,0x01 + vpclmulqdq ymm8,ymm4,ymm15,0x10 + vpclmulqdq ymm1,ymm4,ymm15,0x11 + vpclmulqdq ymm6,ymm4,ymm15,0x00 + + vpxorq zmm7,zmm7,zmm12 + vpxorq zmm8,zmm8,zmm13 + vpxorq zmm1,zmm1,zmm9 + vpxorq zmm6,zmm6,zmm10 + + vpxorq zmm7,zmm7,zmm8 + vpsrldq zmm12,zmm7,8 + vpslldq zmm13,zmm7,8 + vpxorq zmm1,zmm1,zmm12 + vpxorq zmm6,zmm6,zmm13 + vextracti64x4 ymm12,zmm1,1 + vpxorq ymm1,ymm1,ymm12 + vextracti32x4 xmm12,ymm1,1 + vpxorq xmm1,xmm1,xmm12 + vextracti64x4 ymm13,zmm6,1 + vpxorq ymm6,ymm6,ymm13 + vextracti32x4 xmm13,ymm6,1 + vpxorq xmm6,xmm6,xmm13 + vmovdqa64 xmm15,XMMWORD[POLY2] + + + vpclmulqdq xmm7,xmm15,xmm6,0x01 + vpslldq xmm7,xmm7,8 + vpxorq xmm7,xmm6,xmm7 + + + 
vpclmulqdq xmm8,xmm15,xmm7,0x00 + vpsrldq xmm8,xmm8,4 + vpclmulqdq xmm2,xmm15,xmm7,0x10 + vpslldq xmm2,xmm2,4 + vpternlogq xmm2,xmm8,xmm1,0x96 + + jmp NEAR $L$_CALC_AAD_done_1 +$L$_AAD_blocks_9_1: + sub r12,1024 + kmovq k1,[r12] + vmovdqu8 zmm11,ZMMWORD[r10] + vmovdqu8 zmm3,ZMMWORD[64+r10] + vmovdqu8 xmm4{k1}{z},[128+r10] + vpshufb zmm11,zmm11,zmm16 + vpshufb zmm3,zmm3,zmm16 + vpshufb xmm4,xmm4,xmm16 + vpxorq zmm11,zmm11,zmm2 + vmovdqu64 zmm15,ZMMWORD[208+rdx] + vpclmulqdq zmm1,zmm11,zmm15,0x11 + vpclmulqdq zmm6,zmm11,zmm15,0x00 + vpclmulqdq zmm7,zmm11,zmm15,0x01 + vpclmulqdq zmm8,zmm11,zmm15,0x10 + vmovdqu64 zmm15,ZMMWORD[272+rdx] + vpclmulqdq zmm9,zmm3,zmm15,0x11 + vpclmulqdq zmm10,zmm3,zmm15,0x00 + vpclmulqdq zmm12,zmm3,zmm15,0x01 + vpclmulqdq zmm13,zmm3,zmm15,0x10 + vpxorq zmm9,zmm1,zmm9 + vpxorq zmm10,zmm6,zmm10 + vpxorq zmm12,zmm7,zmm12 + vpxorq zmm13,zmm8,zmm13 + vmovdqu64 xmm15,XMMWORD[336+rdx] + vpclmulqdq xmm7,xmm4,xmm15,0x01 + vpclmulqdq xmm8,xmm4,xmm15,0x10 + vpclmulqdq xmm1,xmm4,xmm15,0x11 + vpclmulqdq xmm6,xmm4,xmm15,0x00 + + vpxorq zmm7,zmm7,zmm12 + vpxorq zmm8,zmm8,zmm13 + vpxorq zmm1,zmm1,zmm9 + vpxorq zmm6,zmm6,zmm10 + + vpxorq zmm7,zmm7,zmm8 + vpsrldq zmm12,zmm7,8 + vpslldq zmm13,zmm7,8 + vpxorq zmm1,zmm1,zmm12 + vpxorq zmm6,zmm6,zmm13 + vextracti64x4 ymm12,zmm1,1 + vpxorq ymm1,ymm1,ymm12 + vextracti32x4 xmm12,ymm1,1 + vpxorq xmm1,xmm1,xmm12 + vextracti64x4 ymm13,zmm6,1 + vpxorq ymm6,ymm6,ymm13 + vextracti32x4 xmm13,ymm6,1 + vpxorq xmm6,xmm6,xmm13 + vmovdqa64 xmm15,XMMWORD[POLY2] + + + vpclmulqdq xmm7,xmm15,xmm6,0x01 + vpslldq xmm7,xmm7,8 + vpxorq xmm7,xmm6,xmm7 + + + vpclmulqdq xmm8,xmm15,xmm7,0x00 + vpsrldq xmm8,xmm8,4 + vpclmulqdq xmm2,xmm15,xmm7,0x10 + vpslldq xmm2,xmm2,4 + vpternlogq xmm2,xmm8,xmm1,0x96 + + jmp NEAR $L$_CALC_AAD_done_1 +$L$_AAD_blocks_8_1: + sub r12,512 + kmovq k1,[r12] + vmovdqu8 zmm11,ZMMWORD[r10] + vmovdqu8 zmm3{k1}{z},[64+r10] + vpshufb zmm11,zmm11,zmm16 + vpshufb zmm3,zmm3,zmm16 + vpxorq zmm11,zmm11,zmm2 + vmovdqu64 zmm15,ZMMWORD[224+rdx] + vpclmulqdq zmm1,zmm11,zmm15,0x11 + vpclmulqdq zmm6,zmm11,zmm15,0x00 + vpclmulqdq zmm7,zmm11,zmm15,0x01 + vpclmulqdq zmm8,zmm11,zmm15,0x10 + vmovdqu64 zmm15,ZMMWORD[288+rdx] + vpclmulqdq zmm9,zmm3,zmm15,0x11 + vpclmulqdq zmm10,zmm3,zmm15,0x00 + vpclmulqdq zmm12,zmm3,zmm15,0x01 + vpclmulqdq zmm13,zmm3,zmm15,0x10 + vpxorq zmm9,zmm1,zmm9 + vpxorq zmm10,zmm6,zmm10 + vpxorq zmm12,zmm7,zmm12 + vpxorq zmm13,zmm8,zmm13 + + vpxorq zmm12,zmm12,zmm13 + vpsrldq zmm7,zmm12,8 + vpslldq zmm8,zmm12,8 + vpxorq zmm1,zmm9,zmm7 + vpxorq zmm6,zmm10,zmm8 + vextracti64x4 ymm12,zmm1,1 + vpxorq ymm1,ymm1,ymm12 + vextracti32x4 xmm12,ymm1,1 + vpxorq xmm1,xmm1,xmm12 + vextracti64x4 ymm13,zmm6,1 + vpxorq ymm6,ymm6,ymm13 + vextracti32x4 xmm13,ymm6,1 + vpxorq xmm6,xmm6,xmm13 + vmovdqa64 xmm15,XMMWORD[POLY2] + + + vpclmulqdq xmm7,xmm15,xmm6,0x01 + vpslldq xmm7,xmm7,8 + vpxorq xmm7,xmm6,xmm7 + + + vpclmulqdq xmm8,xmm15,xmm7,0x00 + vpsrldq xmm8,xmm8,4 + vpclmulqdq xmm2,xmm15,xmm7,0x10 + vpslldq xmm2,xmm2,4 + vpternlogq xmm2,xmm8,xmm1,0x96 + + jmp NEAR $L$_CALC_AAD_done_1 +$L$_AAD_blocks_7_1: + sub r12,512 + kmovq k1,[r12] + vmovdqu8 zmm11,ZMMWORD[r10] + vmovdqu8 zmm3{k1}{z},[64+r10] + vpshufb zmm11,zmm11,zmm16 + vpshufb zmm3,zmm3,zmm16 + vpxorq zmm11,zmm11,zmm2 + vmovdqu64 zmm15,ZMMWORD[240+rdx] + vpclmulqdq zmm9,zmm11,zmm15,0x11 + vpclmulqdq zmm10,zmm11,zmm15,0x00 + vpclmulqdq zmm12,zmm11,zmm15,0x01 + vpclmulqdq zmm13,zmm11,zmm15,0x10 + vmovdqu64 ymm15,YMMWORD[304+rdx] + vinserti64x2 zmm15,zmm15,ZMMWORD[336+rdx],2 + vpclmulqdq 
zmm7,zmm3,zmm15,0x01 + vpclmulqdq zmm8,zmm3,zmm15,0x10 + vpclmulqdq zmm1,zmm3,zmm15,0x11 + vpclmulqdq zmm6,zmm3,zmm15,0x00 + + vpxorq zmm7,zmm7,zmm12 + vpxorq zmm8,zmm8,zmm13 + vpxorq zmm1,zmm1,zmm9 + vpxorq zmm6,zmm6,zmm10 + + vpxorq zmm7,zmm7,zmm8 + vpsrldq zmm12,zmm7,8 + vpslldq zmm13,zmm7,8 + vpxorq zmm1,zmm1,zmm12 + vpxorq zmm6,zmm6,zmm13 + vextracti64x4 ymm12,zmm1,1 + vpxorq ymm1,ymm1,ymm12 + vextracti32x4 xmm12,ymm1,1 + vpxorq xmm1,xmm1,xmm12 + vextracti64x4 ymm13,zmm6,1 + vpxorq ymm6,ymm6,ymm13 + vextracti32x4 xmm13,ymm6,1 + vpxorq xmm6,xmm6,xmm13 + vmovdqa64 xmm15,XMMWORD[POLY2] + + + vpclmulqdq xmm7,xmm15,xmm6,0x01 + vpslldq xmm7,xmm7,8 + vpxorq xmm7,xmm6,xmm7 + + + vpclmulqdq xmm8,xmm15,xmm7,0x00 + vpsrldq xmm8,xmm8,4 + vpclmulqdq xmm2,xmm15,xmm7,0x10 + vpslldq xmm2,xmm2,4 + vpternlogq xmm2,xmm8,xmm1,0x96 + + jmp NEAR $L$_CALC_AAD_done_1 +$L$_AAD_blocks_6_1: + sub r12,512 + kmovq k1,[r12] + vmovdqu8 zmm11,ZMMWORD[r10] + vmovdqu8 ymm3{k1}{z},[64+r10] + vpshufb zmm11,zmm11,zmm16 + vpshufb ymm3,ymm3,ymm16 + vpxorq zmm11,zmm11,zmm2 + vmovdqu64 zmm15,ZMMWORD[256+rdx] + vpclmulqdq zmm9,zmm11,zmm15,0x11 + vpclmulqdq zmm10,zmm11,zmm15,0x00 + vpclmulqdq zmm12,zmm11,zmm15,0x01 + vpclmulqdq zmm13,zmm11,zmm15,0x10 + vmovdqu64 ymm15,YMMWORD[320+rdx] + vpclmulqdq ymm7,ymm3,ymm15,0x01 + vpclmulqdq ymm8,ymm3,ymm15,0x10 + vpclmulqdq ymm1,ymm3,ymm15,0x11 + vpclmulqdq ymm6,ymm3,ymm15,0x00 + + vpxorq zmm7,zmm7,zmm12 + vpxorq zmm8,zmm8,zmm13 + vpxorq zmm1,zmm1,zmm9 + vpxorq zmm6,zmm6,zmm10 + + vpxorq zmm7,zmm7,zmm8 + vpsrldq zmm12,zmm7,8 + vpslldq zmm13,zmm7,8 + vpxorq zmm1,zmm1,zmm12 + vpxorq zmm6,zmm6,zmm13 + vextracti64x4 ymm12,zmm1,1 + vpxorq ymm1,ymm1,ymm12 + vextracti32x4 xmm12,ymm1,1 + vpxorq xmm1,xmm1,xmm12 + vextracti64x4 ymm13,zmm6,1 + vpxorq ymm6,ymm6,ymm13 + vextracti32x4 xmm13,ymm6,1 + vpxorq xmm6,xmm6,xmm13 + vmovdqa64 xmm15,XMMWORD[POLY2] + + + vpclmulqdq xmm7,xmm15,xmm6,0x01 + vpslldq xmm7,xmm7,8 + vpxorq xmm7,xmm6,xmm7 + + + vpclmulqdq xmm8,xmm15,xmm7,0x00 + vpsrldq xmm8,xmm8,4 + vpclmulqdq xmm2,xmm15,xmm7,0x10 + vpslldq xmm2,xmm2,4 + vpternlogq xmm2,xmm8,xmm1,0x96 + + jmp NEAR $L$_CALC_AAD_done_1 +$L$_AAD_blocks_5_1: + sub r12,512 + kmovq k1,[r12] + vmovdqu8 zmm11,ZMMWORD[r10] + vmovdqu8 xmm3{k1}{z},[64+r10] + vpshufb zmm11,zmm11,zmm16 + vpshufb xmm3,xmm3,xmm16 + vpxorq zmm11,zmm11,zmm2 + vmovdqu64 zmm15,ZMMWORD[272+rdx] + vpclmulqdq zmm9,zmm11,zmm15,0x11 + vpclmulqdq zmm10,zmm11,zmm15,0x00 + vpclmulqdq zmm12,zmm11,zmm15,0x01 + vpclmulqdq zmm13,zmm11,zmm15,0x10 + vmovdqu64 xmm15,XMMWORD[336+rdx] + vpclmulqdq xmm7,xmm3,xmm15,0x01 + vpclmulqdq xmm8,xmm3,xmm15,0x10 + vpclmulqdq xmm1,xmm3,xmm15,0x11 + vpclmulqdq xmm6,xmm3,xmm15,0x00 + + vpxorq zmm7,zmm7,zmm12 + vpxorq zmm8,zmm8,zmm13 + vpxorq zmm1,zmm1,zmm9 + vpxorq zmm6,zmm6,zmm10 + + vpxorq zmm7,zmm7,zmm8 + vpsrldq zmm12,zmm7,8 + vpslldq zmm13,zmm7,8 + vpxorq zmm1,zmm1,zmm12 + vpxorq zmm6,zmm6,zmm13 + vextracti64x4 ymm12,zmm1,1 + vpxorq ymm1,ymm1,ymm12 + vextracti32x4 xmm12,ymm1,1 + vpxorq xmm1,xmm1,xmm12 + vextracti64x4 ymm13,zmm6,1 + vpxorq ymm6,ymm6,ymm13 + vextracti32x4 xmm13,ymm6,1 + vpxorq xmm6,xmm6,xmm13 + vmovdqa64 xmm15,XMMWORD[POLY2] + + + vpclmulqdq xmm7,xmm15,xmm6,0x01 + vpslldq xmm7,xmm7,8 + vpxorq xmm7,xmm6,xmm7 + + + vpclmulqdq xmm8,xmm15,xmm7,0x00 + vpsrldq xmm8,xmm8,4 + vpclmulqdq xmm2,xmm15,xmm7,0x10 + vpslldq xmm2,xmm2,4 + vpternlogq xmm2,xmm8,xmm1,0x96 + + jmp NEAR $L$_CALC_AAD_done_1 +$L$_AAD_blocks_4_1: + kmovq k1,[r12] + vmovdqu8 zmm11{k1}{z},[r10] + vpshufb zmm11,zmm11,zmm16 + vpxorq zmm11,zmm11,zmm2 + 
vmovdqu64 zmm15,ZMMWORD[288+rdx] + vpclmulqdq zmm9,zmm11,zmm15,0x11 + vpclmulqdq zmm10,zmm11,zmm15,0x00 + vpclmulqdq zmm12,zmm11,zmm15,0x01 + vpclmulqdq zmm13,zmm11,zmm15,0x10 + + vpxorq zmm12,zmm12,zmm13 + vpsrldq zmm7,zmm12,8 + vpslldq zmm8,zmm12,8 + vpxorq zmm1,zmm9,zmm7 + vpxorq zmm6,zmm10,zmm8 + vextracti64x4 ymm12,zmm1,1 + vpxorq ymm1,ymm1,ymm12 + vextracti32x4 xmm12,ymm1,1 + vpxorq xmm1,xmm1,xmm12 + vextracti64x4 ymm13,zmm6,1 + vpxorq ymm6,ymm6,ymm13 + vextracti32x4 xmm13,ymm6,1 + vpxorq xmm6,xmm6,xmm13 + vmovdqa64 xmm15,XMMWORD[POLY2] + + + vpclmulqdq xmm7,xmm15,xmm6,0x01 + vpslldq xmm7,xmm7,8 + vpxorq xmm7,xmm6,xmm7 + + + vpclmulqdq xmm8,xmm15,xmm7,0x00 + vpsrldq xmm8,xmm8,4 + vpclmulqdq xmm2,xmm15,xmm7,0x10 + vpslldq xmm2,xmm2,4 + vpternlogq xmm2,xmm8,xmm1,0x96 + + jmp NEAR $L$_CALC_AAD_done_1 +$L$_AAD_blocks_3_1: + kmovq k1,[r12] + vmovdqu8 zmm11{k1}{z},[r10] + vpshufb zmm11,zmm11,zmm16 + vpxorq zmm11,zmm11,zmm2 + vmovdqu64 ymm15,YMMWORD[304+rdx] + vinserti64x2 zmm15,zmm15,ZMMWORD[336+rdx],2 + vpclmulqdq zmm7,zmm11,zmm15,0x01 + vpclmulqdq zmm8,zmm11,zmm15,0x10 + vpclmulqdq zmm1,zmm11,zmm15,0x11 + vpclmulqdq zmm6,zmm11,zmm15,0x00 + + vpxorq zmm7,zmm7,zmm8 + vpsrldq zmm12,zmm7,8 + vpslldq zmm13,zmm7,8 + vpxorq zmm1,zmm1,zmm12 + vpxorq zmm6,zmm6,zmm13 + vextracti64x4 ymm12,zmm1,1 + vpxorq ymm1,ymm1,ymm12 + vextracti32x4 xmm12,ymm1,1 + vpxorq xmm1,xmm1,xmm12 + vextracti64x4 ymm13,zmm6,1 + vpxorq ymm6,ymm6,ymm13 + vextracti32x4 xmm13,ymm6,1 + vpxorq xmm6,xmm6,xmm13 + vmovdqa64 xmm15,XMMWORD[POLY2] + + + vpclmulqdq xmm7,xmm15,xmm6,0x01 + vpslldq xmm7,xmm7,8 + vpxorq xmm7,xmm6,xmm7 + + + vpclmulqdq xmm8,xmm15,xmm7,0x00 + vpsrldq xmm8,xmm8,4 + vpclmulqdq xmm2,xmm15,xmm7,0x10 + vpslldq xmm2,xmm2,4 + vpternlogq xmm2,xmm8,xmm1,0x96 + + jmp NEAR $L$_CALC_AAD_done_1 +$L$_AAD_blocks_2_1: + kmovq k1,[r12] + vmovdqu8 ymm11{k1}{z},[r10] + vpshufb ymm11,ymm11,ymm16 + vpxorq zmm11,zmm11,zmm2 + vmovdqu64 ymm15,YMMWORD[320+rdx] + vpclmulqdq ymm7,ymm11,ymm15,0x01 + vpclmulqdq ymm8,ymm11,ymm15,0x10 + vpclmulqdq ymm1,ymm11,ymm15,0x11 + vpclmulqdq ymm6,ymm11,ymm15,0x00 + + vpxorq zmm7,zmm7,zmm8 + vpsrldq zmm12,zmm7,8 + vpslldq zmm13,zmm7,8 + vpxorq zmm1,zmm1,zmm12 + vpxorq zmm6,zmm6,zmm13 + vextracti64x4 ymm12,zmm1,1 + vpxorq ymm1,ymm1,ymm12 + vextracti32x4 xmm12,ymm1,1 + vpxorq xmm1,xmm1,xmm12 + vextracti64x4 ymm13,zmm6,1 + vpxorq ymm6,ymm6,ymm13 + vextracti32x4 xmm13,ymm6,1 + vpxorq xmm6,xmm6,xmm13 + vmovdqa64 xmm15,XMMWORD[POLY2] + + + vpclmulqdq xmm7,xmm15,xmm6,0x01 + vpslldq xmm7,xmm7,8 + vpxorq xmm7,xmm6,xmm7 + + + vpclmulqdq xmm8,xmm15,xmm7,0x00 + vpsrldq xmm8,xmm8,4 + vpclmulqdq xmm2,xmm15,xmm7,0x10 + vpslldq xmm2,xmm2,4 + vpternlogq xmm2,xmm8,xmm1,0x96 + + jmp NEAR $L$_CALC_AAD_done_1 +$L$_AAD_blocks_1_1: + kmovq k1,[r12] + vmovdqu8 xmm11{k1}{z},[r10] + vpshufb xmm11,xmm11,xmm16 + vpxorq zmm11,zmm11,zmm2 + vmovdqu64 xmm15,XMMWORD[336+rdx] + vpclmulqdq xmm7,xmm11,xmm15,0x01 + vpclmulqdq xmm8,xmm11,xmm15,0x10 + vpclmulqdq xmm1,xmm11,xmm15,0x11 + vpclmulqdq xmm6,xmm11,xmm15,0x00 + + vpxorq zmm7,zmm7,zmm8 + vpsrldq zmm12,zmm7,8 + vpslldq zmm13,zmm7,8 + vpxorq zmm1,zmm1,zmm12 + vpxorq zmm6,zmm6,zmm13 + vextracti64x4 ymm12,zmm1,1 + vpxorq ymm1,ymm1,ymm12 + vextracti32x4 xmm12,ymm1,1 + vpxorq xmm1,xmm1,xmm12 + vextracti64x4 ymm13,zmm6,1 + vpxorq ymm6,ymm6,ymm13 + vextracti32x4 xmm13,ymm6,1 + vpxorq xmm6,xmm6,xmm13 + vmovdqa64 xmm15,XMMWORD[POLY2] + + + vpclmulqdq xmm7,xmm15,xmm6,0x01 + vpslldq xmm7,xmm7,8 + vpxorq xmm7,xmm6,xmm7 + + + vpclmulqdq xmm8,xmm15,xmm7,0x00 + vpsrldq xmm8,xmm8,4 + vpclmulqdq 
xmm2,xmm15,xmm7,0x10 + vpslldq xmm2,xmm2,4 + vpternlogq xmm2,xmm8,xmm1,0x96 + +$L$_CALC_AAD_done_1: + mov r10,r9 + shl r10,3 + vmovq xmm3,r10 + + + vpxorq xmm2,xmm3,xmm2 + + vmovdqu64 xmm1,XMMWORD[336+rdx] + + vpclmulqdq xmm11,xmm2,xmm1,0x11 + vpclmulqdq xmm3,xmm2,xmm1,0x00 + vpclmulqdq xmm4,xmm2,xmm1,0x01 + vpclmulqdq xmm2,xmm2,xmm1,0x10 + vpxorq xmm2,xmm2,xmm4 + + vpsrldq xmm4,xmm2,8 + vpslldq xmm2,xmm2,8 + vpxorq xmm11,xmm11,xmm4 + vpxorq xmm2,xmm2,xmm3 + + + + vmovdqu64 xmm4,XMMWORD[POLY2] + + vpclmulqdq xmm3,xmm4,xmm2,0x01 + vpslldq xmm3,xmm3,8 + vpxorq xmm2,xmm2,xmm3 + + + + vpclmulqdq xmm3,xmm4,xmm2,0x00 + vpsrldq xmm3,xmm3,4 + vpclmulqdq xmm2,xmm4,xmm2,0x10 + vpslldq xmm2,xmm2,4 + + vpternlogq xmm2,xmm11,xmm3,0x96 + + vpshufb xmm2,xmm2,XMMWORD[SHUF_MASK] + jmp NEAR skip_iv_len_12_init_IV +iv_len_12_init_IV: + + vmovdqu8 xmm2,XMMWORD[ONEf] + mov r11,r8 + mov r10d,0x0000000000000fff + kmovq k1,r10 + vmovdqu8 xmm2{k1},[r11] +skip_iv_len_12_init_IV: + vmovdqu xmm1,xmm2 + + + mov r10d,DWORD[240+rcx] + cmp r10d,9 + je NEAR $L$aes_128_4 + cmp r10d,11 + je NEAR $L$aes_192_4 + cmp r10d,13 + je NEAR $L$aes_256_4 + jmp NEAR $L$exit_aes_4 +ALIGN 32 +$L$aes_128_4: + vpxorq xmm1,xmm1,XMMWORD[rcx] + + vaesenc xmm1,xmm1,XMMWORD[16+rcx] + + vaesenc xmm1,xmm1,XMMWORD[32+rcx] + + vaesenc xmm1,xmm1,XMMWORD[48+rcx] + + vaesenc xmm1,xmm1,XMMWORD[64+rcx] + + vaesenc xmm1,xmm1,XMMWORD[80+rcx] + + vaesenc xmm1,xmm1,XMMWORD[96+rcx] + + vaesenc xmm1,xmm1,XMMWORD[112+rcx] + + vaesenc xmm1,xmm1,XMMWORD[128+rcx] + + vaesenc xmm1,xmm1,XMMWORD[144+rcx] + + vaesenclast xmm1,xmm1,XMMWORD[160+rcx] + jmp NEAR $L$exit_aes_4 +ALIGN 32 +$L$aes_192_4: + vpxorq xmm1,xmm1,XMMWORD[rcx] + + vaesenc xmm1,xmm1,XMMWORD[16+rcx] + + vaesenc xmm1,xmm1,XMMWORD[32+rcx] + + vaesenc xmm1,xmm1,XMMWORD[48+rcx] + + vaesenc xmm1,xmm1,XMMWORD[64+rcx] + + vaesenc xmm1,xmm1,XMMWORD[80+rcx] + + vaesenc xmm1,xmm1,XMMWORD[96+rcx] + + vaesenc xmm1,xmm1,XMMWORD[112+rcx] + + vaesenc xmm1,xmm1,XMMWORD[128+rcx] + + vaesenc xmm1,xmm1,XMMWORD[144+rcx] + + vaesenc xmm1,xmm1,XMMWORD[160+rcx] + + vaesenc xmm1,xmm1,XMMWORD[176+rcx] + + vaesenclast xmm1,xmm1,XMMWORD[192+rcx] + jmp NEAR $L$exit_aes_4 +ALIGN 32 +$L$aes_256_4: + vpxorq xmm1,xmm1,XMMWORD[rcx] + + vaesenc xmm1,xmm1,XMMWORD[16+rcx] + + vaesenc xmm1,xmm1,XMMWORD[32+rcx] + + vaesenc xmm1,xmm1,XMMWORD[48+rcx] + + vaesenc xmm1,xmm1,XMMWORD[64+rcx] + + vaesenc xmm1,xmm1,XMMWORD[80+rcx] + + vaesenc xmm1,xmm1,XMMWORD[96+rcx] + + vaesenc xmm1,xmm1,XMMWORD[112+rcx] + + vaesenc xmm1,xmm1,XMMWORD[128+rcx] + + vaesenc xmm1,xmm1,XMMWORD[144+rcx] + + vaesenc xmm1,xmm1,XMMWORD[160+rcx] + + vaesenc xmm1,xmm1,XMMWORD[176+rcx] + + vaesenc xmm1,xmm1,XMMWORD[192+rcx] + + vaesenc xmm1,xmm1,XMMWORD[208+rcx] + + vaesenclast xmm1,xmm1,XMMWORD[224+rcx] + jmp NEAR $L$exit_aes_4 +$L$exit_aes_4: + + vmovdqu XMMWORD[32+rdx],xmm1 + + + vpshufb xmm2,xmm2,XMMWORD[SHUF_MASK] + vmovdqu XMMWORD[rdx],xmm2 + cmp r9,256 + jbe NEAR $L$skip_hkeys_cleanup_5 + vpxor xmm0,xmm0,xmm0 + vmovdqa64 ZMMWORD[rsp],zmm0 + vmovdqa64 ZMMWORD[64+rsp],zmm0 + vmovdqa64 ZMMWORD[128+rsp],zmm0 + vmovdqa64 ZMMWORD[192+rsp],zmm0 + vmovdqa64 ZMMWORD[256+rsp],zmm0 + vmovdqa64 ZMMWORD[320+rsp],zmm0 + vmovdqa64 ZMMWORD[384+rsp],zmm0 + vmovdqa64 ZMMWORD[448+rsp],zmm0 + vmovdqa64 ZMMWORD[512+rsp],zmm0 + vmovdqa64 ZMMWORD[576+rsp],zmm0 + vmovdqa64 ZMMWORD[640+rsp],zmm0 + vmovdqa64 ZMMWORD[704+rsp],zmm0 +$L$skip_hkeys_cleanup_5: + vzeroupper + vmovdqu xmm15,XMMWORD[((-16))+rbp] + vmovdqu xmm14,XMMWORD[((-32))+rbp] + vmovdqu xmm13,XMMWORD[((-48))+rbp] + vmovdqu 
xmm12,XMMWORD[((-64))+rbp] + vmovdqu xmm11,XMMWORD[((-80))+rbp] + vmovdqu xmm10,XMMWORD[((-96))+rbp] + vmovdqu xmm9,XMMWORD[((-112))+rbp] + vmovdqu xmm8,XMMWORD[((-128))+rbp] + vmovdqu xmm7,XMMWORD[((-144))+rbp] + vmovdqu xmm6,XMMWORD[((-160))+rbp] + lea rsp,[8+rbp] + pop rsi + + pop rdi + + pop r15 + + pop r14 + + pop r13 + + pop r12 + + pop rbp + + pop rbx + +$L$abort_setiv: + DB 0F3h,0C3h ;repret +$L$setiv_seh_end: + + +global ossl_aes_gcm_update_aad_avx512 + +ALIGN 32 +ossl_aes_gcm_update_aad_avx512: + +$L$ghash_seh_begin: +DB 243,15,30,250 + push rbx + +$L$ghash_seh_push_rbx: + push rbp + +$L$ghash_seh_push_rbp: + push r12 + +$L$ghash_seh_push_r12: + push r13 + +$L$ghash_seh_push_r13: + push r14 + +$L$ghash_seh_push_r14: + push r15 + +$L$ghash_seh_push_r15: + push rdi +$L$ghash_seh_push_rdi: + push rsi +$L$ghash_seh_push_rsi: + + sub rsp,168 +$L$ghash_seh_allocstack_xmm: + + + + + + + + + + + lea rbp,[160+rsp] + +$L$ghash_seh_setfp: + vmovdqu XMMWORD[rsp],xmm6 +$L$ghash_seh_save_xmm6: + vmovdqu XMMWORD[16+rsp],xmm7 +$L$ghash_seh_save_xmm7: + vmovdqu XMMWORD[32+rsp],xmm8 +$L$ghash_seh_save_xmm8: + vmovdqu XMMWORD[48+rsp],xmm9 +$L$ghash_seh_save_xmm9: + vmovdqu XMMWORD[64+rsp],xmm10 +$L$ghash_seh_save_xmm10: + vmovdqu XMMWORD[80+rsp],xmm11 +$L$ghash_seh_save_xmm11: + vmovdqu XMMWORD[96+rsp],xmm12 +$L$ghash_seh_save_xmm12: + vmovdqu XMMWORD[112+rsp],xmm13 +$L$ghash_seh_save_xmm13: + vmovdqu XMMWORD[128+rsp],xmm14 +$L$ghash_seh_save_xmm14: + vmovdqu XMMWORD[144+rsp],xmm15 +$L$ghash_seh_save_xmm15: + +$L$ghash_seh_prolog_end: + sub rsp,816 + and rsp,(-64) + vmovdqu64 xmm14,XMMWORD[64+rcx] + mov r10,rdx + mov r11,r8 + or r11,r11 + jz NEAR $L$_CALC_AAD_done_6 + + xor rbx,rbx + vmovdqa64 zmm16,ZMMWORD[SHUF_MASK] + +$L$_get_AAD_loop48x16_6: + cmp r11,768 + jl NEAR $L$_exit_AAD_loop48x16_6 + vmovdqu64 zmm11,ZMMWORD[r10] + vmovdqu64 zmm3,ZMMWORD[64+r10] + vmovdqu64 zmm4,ZMMWORD[128+r10] + vmovdqu64 zmm5,ZMMWORD[192+r10] + vpshufb zmm11,zmm11,zmm16 + vpshufb zmm3,zmm3,zmm16 + vpshufb zmm4,zmm4,zmm16 + vpshufb zmm5,zmm5,zmm16 + test rbx,rbx + jnz NEAR $L$_skip_hkeys_precomputation_7 + + vmovdqu64 zmm1,ZMMWORD[288+rcx] + vmovdqu64 ZMMWORD[704+rsp],zmm1 + + vmovdqu64 zmm9,ZMMWORD[224+rcx] + vmovdqu64 ZMMWORD[640+rsp],zmm9 + + + vshufi64x2 zmm9,zmm9,zmm9,0x00 + + vmovdqu64 zmm10,ZMMWORD[160+rcx] + vmovdqu64 ZMMWORD[576+rsp],zmm10 + + vmovdqu64 zmm12,ZMMWORD[96+rcx] + vmovdqu64 ZMMWORD[512+rsp],zmm12 + + vpclmulqdq zmm13,zmm10,zmm9,0x11 + vpclmulqdq zmm15,zmm10,zmm9,0x00 + vpclmulqdq zmm17,zmm10,zmm9,0x01 + vpclmulqdq zmm10,zmm10,zmm9,0x10 + vpxorq zmm10,zmm10,zmm17 + + vpsrldq zmm17,zmm10,8 + vpslldq zmm10,zmm10,8 + vpxorq zmm13,zmm13,zmm17 + vpxorq zmm10,zmm10,zmm15 + + + + vmovdqu64 zmm17,ZMMWORD[POLY2] + + vpclmulqdq zmm15,zmm17,zmm10,0x01 + vpslldq zmm15,zmm15,8 + vpxorq zmm10,zmm10,zmm15 + + + + vpclmulqdq zmm15,zmm17,zmm10,0x00 + vpsrldq zmm15,zmm15,4 + vpclmulqdq zmm10,zmm17,zmm10,0x10 + vpslldq zmm10,zmm10,4 + + vpternlogq zmm10,zmm13,zmm15,0x96 + + vmovdqu64 ZMMWORD[448+rsp],zmm10 + + vpclmulqdq zmm13,zmm12,zmm9,0x11 + vpclmulqdq zmm15,zmm12,zmm9,0x00 + vpclmulqdq zmm17,zmm12,zmm9,0x01 + vpclmulqdq zmm12,zmm12,zmm9,0x10 + vpxorq zmm12,zmm12,zmm17 + + vpsrldq zmm17,zmm12,8 + vpslldq zmm12,zmm12,8 + vpxorq zmm13,zmm13,zmm17 + vpxorq zmm12,zmm12,zmm15 + + + + vmovdqu64 zmm17,ZMMWORD[POLY2] + + vpclmulqdq zmm15,zmm17,zmm12,0x01 + vpslldq zmm15,zmm15,8 + vpxorq zmm12,zmm12,zmm15 + + + + vpclmulqdq zmm15,zmm17,zmm12,0x00 + vpsrldq zmm15,zmm15,4 + vpclmulqdq zmm12,zmm17,zmm12,0x10 + vpslldq 
zmm12,zmm12,4 + + vpternlogq zmm12,zmm13,zmm15,0x96 + + vmovdqu64 ZMMWORD[384+rsp],zmm12 + + vpclmulqdq zmm13,zmm10,zmm9,0x11 + vpclmulqdq zmm15,zmm10,zmm9,0x00 + vpclmulqdq zmm17,zmm10,zmm9,0x01 + vpclmulqdq zmm10,zmm10,zmm9,0x10 + vpxorq zmm10,zmm10,zmm17 + + vpsrldq zmm17,zmm10,8 + vpslldq zmm10,zmm10,8 + vpxorq zmm13,zmm13,zmm17 + vpxorq zmm10,zmm10,zmm15 + + + + vmovdqu64 zmm17,ZMMWORD[POLY2] + + vpclmulqdq zmm15,zmm17,zmm10,0x01 + vpslldq zmm15,zmm15,8 + vpxorq zmm10,zmm10,zmm15 + + + + vpclmulqdq zmm15,zmm17,zmm10,0x00 + vpsrldq zmm15,zmm15,4 + vpclmulqdq zmm10,zmm17,zmm10,0x10 + vpslldq zmm10,zmm10,4 + + vpternlogq zmm10,zmm13,zmm15,0x96 + + vmovdqu64 ZMMWORD[320+rsp],zmm10 + + vpclmulqdq zmm13,zmm12,zmm9,0x11 + vpclmulqdq zmm15,zmm12,zmm9,0x00 + vpclmulqdq zmm17,zmm12,zmm9,0x01 + vpclmulqdq zmm12,zmm12,zmm9,0x10 + vpxorq zmm12,zmm12,zmm17 + + vpsrldq zmm17,zmm12,8 + vpslldq zmm12,zmm12,8 + vpxorq zmm13,zmm13,zmm17 + vpxorq zmm12,zmm12,zmm15 + + + + vmovdqu64 zmm17,ZMMWORD[POLY2] + + vpclmulqdq zmm15,zmm17,zmm12,0x01 + vpslldq zmm15,zmm15,8 + vpxorq zmm12,zmm12,zmm15 + + + + vpclmulqdq zmm15,zmm17,zmm12,0x00 + vpsrldq zmm15,zmm15,4 + vpclmulqdq zmm12,zmm17,zmm12,0x10 + vpslldq zmm12,zmm12,4 + + vpternlogq zmm12,zmm13,zmm15,0x96 + + vmovdqu64 ZMMWORD[256+rsp],zmm12 + + vpclmulqdq zmm13,zmm10,zmm9,0x11 + vpclmulqdq zmm15,zmm10,zmm9,0x00 + vpclmulqdq zmm17,zmm10,zmm9,0x01 + vpclmulqdq zmm10,zmm10,zmm9,0x10 + vpxorq zmm10,zmm10,zmm17 + + vpsrldq zmm17,zmm10,8 + vpslldq zmm10,zmm10,8 + vpxorq zmm13,zmm13,zmm17 + vpxorq zmm10,zmm10,zmm15 + + + + vmovdqu64 zmm17,ZMMWORD[POLY2] + + vpclmulqdq zmm15,zmm17,zmm10,0x01 + vpslldq zmm15,zmm15,8 + vpxorq zmm10,zmm10,zmm15 + + + + vpclmulqdq zmm15,zmm17,zmm10,0x00 + vpsrldq zmm15,zmm15,4 + vpclmulqdq zmm10,zmm17,zmm10,0x10 + vpslldq zmm10,zmm10,4 + + vpternlogq zmm10,zmm13,zmm15,0x96 + + vmovdqu64 ZMMWORD[192+rsp],zmm10 + + vpclmulqdq zmm13,zmm12,zmm9,0x11 + vpclmulqdq zmm15,zmm12,zmm9,0x00 + vpclmulqdq zmm17,zmm12,zmm9,0x01 + vpclmulqdq zmm12,zmm12,zmm9,0x10 + vpxorq zmm12,zmm12,zmm17 + + vpsrldq zmm17,zmm12,8 + vpslldq zmm12,zmm12,8 + vpxorq zmm13,zmm13,zmm17 + vpxorq zmm12,zmm12,zmm15 + + + + vmovdqu64 zmm17,ZMMWORD[POLY2] + + vpclmulqdq zmm15,zmm17,zmm12,0x01 + vpslldq zmm15,zmm15,8 + vpxorq zmm12,zmm12,zmm15 + + + + vpclmulqdq zmm15,zmm17,zmm12,0x00 + vpsrldq zmm15,zmm15,4 + vpclmulqdq zmm12,zmm17,zmm12,0x10 + vpslldq zmm12,zmm12,4 + + vpternlogq zmm12,zmm13,zmm15,0x96 + + vmovdqu64 ZMMWORD[128+rsp],zmm12 + + vpclmulqdq zmm13,zmm10,zmm9,0x11 + vpclmulqdq zmm15,zmm10,zmm9,0x00 + vpclmulqdq zmm17,zmm10,zmm9,0x01 + vpclmulqdq zmm10,zmm10,zmm9,0x10 + vpxorq zmm10,zmm10,zmm17 + + vpsrldq zmm17,zmm10,8 + vpslldq zmm10,zmm10,8 + vpxorq zmm13,zmm13,zmm17 + vpxorq zmm10,zmm10,zmm15 + + + + vmovdqu64 zmm17,ZMMWORD[POLY2] + + vpclmulqdq zmm15,zmm17,zmm10,0x01 + vpslldq zmm15,zmm15,8 + vpxorq zmm10,zmm10,zmm15 + + + + vpclmulqdq zmm15,zmm17,zmm10,0x00 + vpsrldq zmm15,zmm15,4 + vpclmulqdq zmm10,zmm17,zmm10,0x10 + vpslldq zmm10,zmm10,4 + + vpternlogq zmm10,zmm13,zmm15,0x96 + + vmovdqu64 ZMMWORD[64+rsp],zmm10 + + vpclmulqdq zmm13,zmm12,zmm9,0x11 + vpclmulqdq zmm15,zmm12,zmm9,0x00 + vpclmulqdq zmm17,zmm12,zmm9,0x01 + vpclmulqdq zmm12,zmm12,zmm9,0x10 + vpxorq zmm12,zmm12,zmm17 + + vpsrldq zmm17,zmm12,8 + vpslldq zmm12,zmm12,8 + vpxorq zmm13,zmm13,zmm17 + vpxorq zmm12,zmm12,zmm15 + + + + vmovdqu64 zmm17,ZMMWORD[POLY2] + + vpclmulqdq zmm15,zmm17,zmm12,0x01 + vpslldq zmm15,zmm15,8 + vpxorq zmm12,zmm12,zmm15 + + + + vpclmulqdq zmm15,zmm17,zmm12,0x00 + vpsrldq 
zmm15,zmm15,4 + vpclmulqdq zmm12,zmm17,zmm12,0x10 + vpslldq zmm12,zmm12,4 + + vpternlogq zmm12,zmm13,zmm15,0x96 + + vmovdqu64 ZMMWORD[rsp],zmm12 +$L$_skip_hkeys_precomputation_7: + mov rbx,1 + vpxorq zmm11,zmm11,zmm14 + vmovdqu64 zmm19,ZMMWORD[rsp] + vpclmulqdq zmm1,zmm11,zmm19,0x11 + vpclmulqdq zmm9,zmm11,zmm19,0x00 + vpclmulqdq zmm10,zmm11,zmm19,0x01 + vpclmulqdq zmm12,zmm11,zmm19,0x10 + vmovdqu64 zmm19,ZMMWORD[64+rsp] + vpclmulqdq zmm13,zmm3,zmm19,0x11 + vpclmulqdq zmm15,zmm3,zmm19,0x00 + vpclmulqdq zmm17,zmm3,zmm19,0x01 + vpclmulqdq zmm18,zmm3,zmm19,0x10 + vpxorq zmm7,zmm10,zmm17 + vpxorq zmm6,zmm1,zmm13 + vpxorq zmm8,zmm9,zmm15 + vpternlogq zmm7,zmm12,zmm18,0x96 + vmovdqu64 zmm19,ZMMWORD[128+rsp] + vpclmulqdq zmm1,zmm4,zmm19,0x11 + vpclmulqdq zmm9,zmm4,zmm19,0x00 + vpclmulqdq zmm10,zmm4,zmm19,0x01 + vpclmulqdq zmm12,zmm4,zmm19,0x10 + vmovdqu64 zmm19,ZMMWORD[192+rsp] + vpclmulqdq zmm13,zmm5,zmm19,0x11 + vpclmulqdq zmm15,zmm5,zmm19,0x00 + vpclmulqdq zmm17,zmm5,zmm19,0x01 + vpclmulqdq zmm18,zmm5,zmm19,0x10 + + vpternlogq zmm7,zmm10,zmm17,0x96 + vpternlogq zmm6,zmm1,zmm13,0x96 + vpternlogq zmm8,zmm9,zmm15,0x96 + vpternlogq zmm7,zmm12,zmm18,0x96 + vmovdqu64 zmm11,ZMMWORD[256+r10] + vmovdqu64 zmm3,ZMMWORD[320+r10] + vmovdqu64 zmm4,ZMMWORD[384+r10] + vmovdqu64 zmm5,ZMMWORD[448+r10] + vpshufb zmm11,zmm11,zmm16 + vpshufb zmm3,zmm3,zmm16 + vpshufb zmm4,zmm4,zmm16 + vpshufb zmm5,zmm5,zmm16 + vmovdqu64 zmm19,ZMMWORD[256+rsp] + vpclmulqdq zmm1,zmm11,zmm19,0x11 + vpclmulqdq zmm9,zmm11,zmm19,0x00 + vpclmulqdq zmm10,zmm11,zmm19,0x01 + vpclmulqdq zmm12,zmm11,zmm19,0x10 + vmovdqu64 zmm19,ZMMWORD[320+rsp] + vpclmulqdq zmm13,zmm3,zmm19,0x11 + vpclmulqdq zmm15,zmm3,zmm19,0x00 + vpclmulqdq zmm17,zmm3,zmm19,0x01 + vpclmulqdq zmm18,zmm3,zmm19,0x10 + vpternlogq zmm7,zmm10,zmm17,0x96 + vpternlogq zmm6,zmm1,zmm13,0x96 + vpternlogq zmm8,zmm9,zmm15,0x96 + vpternlogq zmm7,zmm12,zmm18,0x96 + vmovdqu64 zmm19,ZMMWORD[384+rsp] + vpclmulqdq zmm1,zmm4,zmm19,0x11 + vpclmulqdq zmm9,zmm4,zmm19,0x00 + vpclmulqdq zmm10,zmm4,zmm19,0x01 + vpclmulqdq zmm12,zmm4,zmm19,0x10 + vmovdqu64 zmm19,ZMMWORD[448+rsp] + vpclmulqdq zmm13,zmm5,zmm19,0x11 + vpclmulqdq zmm15,zmm5,zmm19,0x00 + vpclmulqdq zmm17,zmm5,zmm19,0x01 + vpclmulqdq zmm18,zmm5,zmm19,0x10 + + vpternlogq zmm7,zmm10,zmm17,0x96 + vpternlogq zmm6,zmm1,zmm13,0x96 + vpternlogq zmm8,zmm9,zmm15,0x96 + vpternlogq zmm7,zmm12,zmm18,0x96 + vmovdqu64 zmm11,ZMMWORD[512+r10] + vmovdqu64 zmm3,ZMMWORD[576+r10] + vmovdqu64 zmm4,ZMMWORD[640+r10] + vmovdqu64 zmm5,ZMMWORD[704+r10] + vpshufb zmm11,zmm11,zmm16 + vpshufb zmm3,zmm3,zmm16 + vpshufb zmm4,zmm4,zmm16 + vpshufb zmm5,zmm5,zmm16 + vmovdqu64 zmm19,ZMMWORD[512+rsp] + vpclmulqdq zmm1,zmm11,zmm19,0x11 + vpclmulqdq zmm9,zmm11,zmm19,0x00 + vpclmulqdq zmm10,zmm11,zmm19,0x01 + vpclmulqdq zmm12,zmm11,zmm19,0x10 + vmovdqu64 zmm19,ZMMWORD[576+rsp] + vpclmulqdq zmm13,zmm3,zmm19,0x11 + vpclmulqdq zmm15,zmm3,zmm19,0x00 + vpclmulqdq zmm17,zmm3,zmm19,0x01 + vpclmulqdq zmm18,zmm3,zmm19,0x10 + vpternlogq zmm7,zmm10,zmm17,0x96 + vpternlogq zmm6,zmm1,zmm13,0x96 + vpternlogq zmm8,zmm9,zmm15,0x96 + vpternlogq zmm7,zmm12,zmm18,0x96 + vmovdqu64 zmm19,ZMMWORD[640+rsp] + vpclmulqdq zmm1,zmm4,zmm19,0x11 + vpclmulqdq zmm9,zmm4,zmm19,0x00 + vpclmulqdq zmm10,zmm4,zmm19,0x01 + vpclmulqdq zmm12,zmm4,zmm19,0x10 + vmovdqu64 zmm19,ZMMWORD[704+rsp] + vpclmulqdq zmm13,zmm5,zmm19,0x11 + vpclmulqdq zmm15,zmm5,zmm19,0x00 + vpclmulqdq zmm17,zmm5,zmm19,0x01 + vpclmulqdq zmm18,zmm5,zmm19,0x10 + + vpternlogq zmm7,zmm10,zmm17,0x96 + vpternlogq zmm6,zmm1,zmm13,0x96 + vpternlogq 
zmm8,zmm9,zmm15,0x96 + vpternlogq zmm7,zmm12,zmm18,0x96 + + vpsrldq zmm1,zmm7,8 + vpslldq zmm9,zmm7,8 + vpxorq zmm6,zmm6,zmm1 + vpxorq zmm8,zmm8,zmm9 + vextracti64x4 ymm1,zmm6,1 + vpxorq ymm6,ymm6,ymm1 + vextracti32x4 xmm1,ymm6,1 + vpxorq xmm6,xmm6,xmm1 + vextracti64x4 ymm9,zmm8,1 + vpxorq ymm8,ymm8,ymm9 + vextracti32x4 xmm9,ymm8,1 + vpxorq xmm8,xmm8,xmm9 + vmovdqa64 xmm10,XMMWORD[POLY2] + + + vpclmulqdq xmm1,xmm10,xmm8,0x01 + vpslldq xmm1,xmm1,8 + vpxorq xmm1,xmm8,xmm1 + + + vpclmulqdq xmm9,xmm10,xmm1,0x00 + vpsrldq xmm9,xmm9,4 + vpclmulqdq xmm14,xmm10,xmm1,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm9,xmm6,0x96 + + sub r11,768 + je NEAR $L$_CALC_AAD_done_6 + + add r10,768 + jmp NEAR $L$_get_AAD_loop48x16_6 + +$L$_exit_AAD_loop48x16_6: + + cmp r11,512 + jl NEAR $L$_less_than_32x16_6 + + vmovdqu64 zmm11,ZMMWORD[r10] + vmovdqu64 zmm3,ZMMWORD[64+r10] + vmovdqu64 zmm4,ZMMWORD[128+r10] + vmovdqu64 zmm5,ZMMWORD[192+r10] + vpshufb zmm11,zmm11,zmm16 + vpshufb zmm3,zmm3,zmm16 + vpshufb zmm4,zmm4,zmm16 + vpshufb zmm5,zmm5,zmm16 + test rbx,rbx + jnz NEAR $L$_skip_hkeys_precomputation_8 + + vmovdqu64 zmm1,ZMMWORD[288+rcx] + vmovdqu64 ZMMWORD[704+rsp],zmm1 + + vmovdqu64 zmm9,ZMMWORD[224+rcx] + vmovdqu64 ZMMWORD[640+rsp],zmm9 + + + vshufi64x2 zmm9,zmm9,zmm9,0x00 + + vmovdqu64 zmm10,ZMMWORD[160+rcx] + vmovdqu64 ZMMWORD[576+rsp],zmm10 + + vmovdqu64 zmm12,ZMMWORD[96+rcx] + vmovdqu64 ZMMWORD[512+rsp],zmm12 + + vpclmulqdq zmm13,zmm10,zmm9,0x11 + vpclmulqdq zmm15,zmm10,zmm9,0x00 + vpclmulqdq zmm17,zmm10,zmm9,0x01 + vpclmulqdq zmm10,zmm10,zmm9,0x10 + vpxorq zmm10,zmm10,zmm17 + + vpsrldq zmm17,zmm10,8 + vpslldq zmm10,zmm10,8 + vpxorq zmm13,zmm13,zmm17 + vpxorq zmm10,zmm10,zmm15 + + + + vmovdqu64 zmm17,ZMMWORD[POLY2] + + vpclmulqdq zmm15,zmm17,zmm10,0x01 + vpslldq zmm15,zmm15,8 + vpxorq zmm10,zmm10,zmm15 + + + + vpclmulqdq zmm15,zmm17,zmm10,0x00 + vpsrldq zmm15,zmm15,4 + vpclmulqdq zmm10,zmm17,zmm10,0x10 + vpslldq zmm10,zmm10,4 + + vpternlogq zmm10,zmm13,zmm15,0x96 + + vmovdqu64 ZMMWORD[448+rsp],zmm10 + + vpclmulqdq zmm13,zmm12,zmm9,0x11 + vpclmulqdq zmm15,zmm12,zmm9,0x00 + vpclmulqdq zmm17,zmm12,zmm9,0x01 + vpclmulqdq zmm12,zmm12,zmm9,0x10 + vpxorq zmm12,zmm12,zmm17 + + vpsrldq zmm17,zmm12,8 + vpslldq zmm12,zmm12,8 + vpxorq zmm13,zmm13,zmm17 + vpxorq zmm12,zmm12,zmm15 + + + + vmovdqu64 zmm17,ZMMWORD[POLY2] + + vpclmulqdq zmm15,zmm17,zmm12,0x01 + vpslldq zmm15,zmm15,8 + vpxorq zmm12,zmm12,zmm15 + + + + vpclmulqdq zmm15,zmm17,zmm12,0x00 + vpsrldq zmm15,zmm15,4 + vpclmulqdq zmm12,zmm17,zmm12,0x10 + vpslldq zmm12,zmm12,4 + + vpternlogq zmm12,zmm13,zmm15,0x96 + + vmovdqu64 ZMMWORD[384+rsp],zmm12 + + vpclmulqdq zmm13,zmm10,zmm9,0x11 + vpclmulqdq zmm15,zmm10,zmm9,0x00 + vpclmulqdq zmm17,zmm10,zmm9,0x01 + vpclmulqdq zmm10,zmm10,zmm9,0x10 + vpxorq zmm10,zmm10,zmm17 + + vpsrldq zmm17,zmm10,8 + vpslldq zmm10,zmm10,8 + vpxorq zmm13,zmm13,zmm17 + vpxorq zmm10,zmm10,zmm15 + + + + vmovdqu64 zmm17,ZMMWORD[POLY2] + + vpclmulqdq zmm15,zmm17,zmm10,0x01 + vpslldq zmm15,zmm15,8 + vpxorq zmm10,zmm10,zmm15 + + + + vpclmulqdq zmm15,zmm17,zmm10,0x00 + vpsrldq zmm15,zmm15,4 + vpclmulqdq zmm10,zmm17,zmm10,0x10 + vpslldq zmm10,zmm10,4 + + vpternlogq zmm10,zmm13,zmm15,0x96 + + vmovdqu64 ZMMWORD[320+rsp],zmm10 + + vpclmulqdq zmm13,zmm12,zmm9,0x11 + vpclmulqdq zmm15,zmm12,zmm9,0x00 + vpclmulqdq zmm17,zmm12,zmm9,0x01 + vpclmulqdq zmm12,zmm12,zmm9,0x10 + vpxorq zmm12,zmm12,zmm17 + + vpsrldq zmm17,zmm12,8 + vpslldq zmm12,zmm12,8 + vpxorq zmm13,zmm13,zmm17 + vpxorq zmm12,zmm12,zmm15 + + + + vmovdqu64 zmm17,ZMMWORD[POLY2] + + vpclmulqdq 
zmm15,zmm17,zmm12,0x01 + vpslldq zmm15,zmm15,8 + vpxorq zmm12,zmm12,zmm15 + + + + vpclmulqdq zmm15,zmm17,zmm12,0x00 + vpsrldq zmm15,zmm15,4 + vpclmulqdq zmm12,zmm17,zmm12,0x10 + vpslldq zmm12,zmm12,4 + + vpternlogq zmm12,zmm13,zmm15,0x96 + + vmovdqu64 ZMMWORD[256+rsp],zmm12 +$L$_skip_hkeys_precomputation_8: + mov rbx,1 + vpxorq zmm11,zmm11,zmm14 + vmovdqu64 zmm19,ZMMWORD[256+rsp] + vpclmulqdq zmm1,zmm11,zmm19,0x11 + vpclmulqdq zmm9,zmm11,zmm19,0x00 + vpclmulqdq zmm10,zmm11,zmm19,0x01 + vpclmulqdq zmm12,zmm11,zmm19,0x10 + vmovdqu64 zmm19,ZMMWORD[320+rsp] + vpclmulqdq zmm13,zmm3,zmm19,0x11 + vpclmulqdq zmm15,zmm3,zmm19,0x00 + vpclmulqdq zmm17,zmm3,zmm19,0x01 + vpclmulqdq zmm18,zmm3,zmm19,0x10 + vpxorq zmm7,zmm10,zmm17 + vpxorq zmm6,zmm1,zmm13 + vpxorq zmm8,zmm9,zmm15 + vpternlogq zmm7,zmm12,zmm18,0x96 + vmovdqu64 zmm19,ZMMWORD[384+rsp] + vpclmulqdq zmm1,zmm4,zmm19,0x11 + vpclmulqdq zmm9,zmm4,zmm19,0x00 + vpclmulqdq zmm10,zmm4,zmm19,0x01 + vpclmulqdq zmm12,zmm4,zmm19,0x10 + vmovdqu64 zmm19,ZMMWORD[448+rsp] + vpclmulqdq zmm13,zmm5,zmm19,0x11 + vpclmulqdq zmm15,zmm5,zmm19,0x00 + vpclmulqdq zmm17,zmm5,zmm19,0x01 + vpclmulqdq zmm18,zmm5,zmm19,0x10 + + vpternlogq zmm7,zmm10,zmm17,0x96 + vpternlogq zmm6,zmm1,zmm13,0x96 + vpternlogq zmm8,zmm9,zmm15,0x96 + vpternlogq zmm7,zmm12,zmm18,0x96 + vmovdqu64 zmm11,ZMMWORD[256+r10] + vmovdqu64 zmm3,ZMMWORD[320+r10] + vmovdqu64 zmm4,ZMMWORD[384+r10] + vmovdqu64 zmm5,ZMMWORD[448+r10] + vpshufb zmm11,zmm11,zmm16 + vpshufb zmm3,zmm3,zmm16 + vpshufb zmm4,zmm4,zmm16 + vpshufb zmm5,zmm5,zmm16 + vmovdqu64 zmm19,ZMMWORD[512+rsp] + vpclmulqdq zmm1,zmm11,zmm19,0x11 + vpclmulqdq zmm9,zmm11,zmm19,0x00 + vpclmulqdq zmm10,zmm11,zmm19,0x01 + vpclmulqdq zmm12,zmm11,zmm19,0x10 + vmovdqu64 zmm19,ZMMWORD[576+rsp] + vpclmulqdq zmm13,zmm3,zmm19,0x11 + vpclmulqdq zmm15,zmm3,zmm19,0x00 + vpclmulqdq zmm17,zmm3,zmm19,0x01 + vpclmulqdq zmm18,zmm3,zmm19,0x10 + vpternlogq zmm7,zmm10,zmm17,0x96 + vpternlogq zmm6,zmm1,zmm13,0x96 + vpternlogq zmm8,zmm9,zmm15,0x96 + vpternlogq zmm7,zmm12,zmm18,0x96 + vmovdqu64 zmm19,ZMMWORD[640+rsp] + vpclmulqdq zmm1,zmm4,zmm19,0x11 + vpclmulqdq zmm9,zmm4,zmm19,0x00 + vpclmulqdq zmm10,zmm4,zmm19,0x01 + vpclmulqdq zmm12,zmm4,zmm19,0x10 + vmovdqu64 zmm19,ZMMWORD[704+rsp] + vpclmulqdq zmm13,zmm5,zmm19,0x11 + vpclmulqdq zmm15,zmm5,zmm19,0x00 + vpclmulqdq zmm17,zmm5,zmm19,0x01 + vpclmulqdq zmm18,zmm5,zmm19,0x10 + + vpternlogq zmm7,zmm10,zmm17,0x96 + vpternlogq zmm6,zmm1,zmm13,0x96 + vpternlogq zmm8,zmm9,zmm15,0x96 + vpternlogq zmm7,zmm12,zmm18,0x96 + + vpsrldq zmm1,zmm7,8 + vpslldq zmm9,zmm7,8 + vpxorq zmm6,zmm6,zmm1 + vpxorq zmm8,zmm8,zmm9 + vextracti64x4 ymm1,zmm6,1 + vpxorq ymm6,ymm6,ymm1 + vextracti32x4 xmm1,ymm6,1 + vpxorq xmm6,xmm6,xmm1 + vextracti64x4 ymm9,zmm8,1 + vpxorq ymm8,ymm8,ymm9 + vextracti32x4 xmm9,ymm8,1 + vpxorq xmm8,xmm8,xmm9 + vmovdqa64 xmm10,XMMWORD[POLY2] + + + vpclmulqdq xmm1,xmm10,xmm8,0x01 + vpslldq xmm1,xmm1,8 + vpxorq xmm1,xmm8,xmm1 + + + vpclmulqdq xmm9,xmm10,xmm1,0x00 + vpsrldq xmm9,xmm9,4 + vpclmulqdq xmm14,xmm10,xmm1,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm9,xmm6,0x96 + + sub r11,512 + je NEAR $L$_CALC_AAD_done_6 + + add r10,512 + jmp NEAR $L$_less_than_16x16_6 + +$L$_less_than_32x16_6: + cmp r11,256 + jl NEAR $L$_less_than_16x16_6 + + vmovdqu64 zmm11,ZMMWORD[r10] + vmovdqu64 zmm3,ZMMWORD[64+r10] + vmovdqu64 zmm4,ZMMWORD[128+r10] + vmovdqu64 zmm5,ZMMWORD[192+r10] + vpshufb zmm11,zmm11,zmm16 + vpshufb zmm3,zmm3,zmm16 + vpshufb zmm4,zmm4,zmm16 + vpshufb zmm5,zmm5,zmm16 + vpxorq zmm11,zmm11,zmm14 + vmovdqu64 
zmm19,ZMMWORD[96+rcx] + vpclmulqdq zmm1,zmm11,zmm19,0x11 + vpclmulqdq zmm9,zmm11,zmm19,0x00 + vpclmulqdq zmm10,zmm11,zmm19,0x01 + vpclmulqdq zmm12,zmm11,zmm19,0x10 + vmovdqu64 zmm19,ZMMWORD[160+rcx] + vpclmulqdq zmm13,zmm3,zmm19,0x11 + vpclmulqdq zmm15,zmm3,zmm19,0x00 + vpclmulqdq zmm17,zmm3,zmm19,0x01 + vpclmulqdq zmm18,zmm3,zmm19,0x10 + vpxorq zmm7,zmm10,zmm17 + vpxorq zmm6,zmm1,zmm13 + vpxorq zmm8,zmm9,zmm15 + vpternlogq zmm7,zmm12,zmm18,0x96 + vmovdqu64 zmm19,ZMMWORD[224+rcx] + vpclmulqdq zmm1,zmm4,zmm19,0x11 + vpclmulqdq zmm9,zmm4,zmm19,0x00 + vpclmulqdq zmm10,zmm4,zmm19,0x01 + vpclmulqdq zmm12,zmm4,zmm19,0x10 + vmovdqu64 zmm19,ZMMWORD[288+rcx] + vpclmulqdq zmm13,zmm5,zmm19,0x11 + vpclmulqdq zmm15,zmm5,zmm19,0x00 + vpclmulqdq zmm17,zmm5,zmm19,0x01 + vpclmulqdq zmm18,zmm5,zmm19,0x10 + + vpternlogq zmm7,zmm10,zmm17,0x96 + vpternlogq zmm6,zmm1,zmm13,0x96 + vpternlogq zmm8,zmm9,zmm15,0x96 + vpternlogq zmm7,zmm12,zmm18,0x96 + + vpsrldq zmm1,zmm7,8 + vpslldq zmm9,zmm7,8 + vpxorq zmm6,zmm6,zmm1 + vpxorq zmm8,zmm8,zmm9 + vextracti64x4 ymm1,zmm6,1 + vpxorq ymm6,ymm6,ymm1 + vextracti32x4 xmm1,ymm6,1 + vpxorq xmm6,xmm6,xmm1 + vextracti64x4 ymm9,zmm8,1 + vpxorq ymm8,ymm8,ymm9 + vextracti32x4 xmm9,ymm8,1 + vpxorq xmm8,xmm8,xmm9 + vmovdqa64 xmm10,XMMWORD[POLY2] + + + vpclmulqdq xmm1,xmm10,xmm8,0x01 + vpslldq xmm1,xmm1,8 + vpxorq xmm1,xmm8,xmm1 + + + vpclmulqdq xmm9,xmm10,xmm1,0x00 + vpsrldq xmm9,xmm9,4 + vpclmulqdq xmm14,xmm10,xmm1,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm9,xmm6,0x96 + + sub r11,256 + je NEAR $L$_CALC_AAD_done_6 + + add r10,256 + +$L$_less_than_16x16_6: + + lea r12,[byte64_len_to_mask_table] + lea r12,[r11*8+r12] + + + add r11d,15 + shr r11d,4 + cmp r11d,2 + jb NEAR $L$_AAD_blocks_1_6 + je NEAR $L$_AAD_blocks_2_6 + cmp r11d,4 + jb NEAR $L$_AAD_blocks_3_6 + je NEAR $L$_AAD_blocks_4_6 + cmp r11d,6 + jb NEAR $L$_AAD_blocks_5_6 + je NEAR $L$_AAD_blocks_6_6 + cmp r11d,8 + jb NEAR $L$_AAD_blocks_7_6 + je NEAR $L$_AAD_blocks_8_6 + cmp r11d,10 + jb NEAR $L$_AAD_blocks_9_6 + je NEAR $L$_AAD_blocks_10_6 + cmp r11d,12 + jb NEAR $L$_AAD_blocks_11_6 + je NEAR $L$_AAD_blocks_12_6 + cmp r11d,14 + jb NEAR $L$_AAD_blocks_13_6 + je NEAR $L$_AAD_blocks_14_6 + cmp r11d,15 + je NEAR $L$_AAD_blocks_15_6 +$L$_AAD_blocks_16_6: + sub r12,1536 + kmovq k1,[r12] + vmovdqu8 zmm11,ZMMWORD[r10] + vmovdqu8 zmm3,ZMMWORD[64+r10] + vmovdqu8 zmm4,ZMMWORD[128+r10] + vmovdqu8 zmm5{k1}{z},[192+r10] + vpshufb zmm11,zmm11,zmm16 + vpshufb zmm3,zmm3,zmm16 + vpshufb zmm4,zmm4,zmm16 + vpshufb zmm5,zmm5,zmm16 + vpxorq zmm11,zmm11,zmm14 + vmovdqu64 zmm15,ZMMWORD[96+rcx] + vpclmulqdq zmm1,zmm11,zmm15,0x11 + vpclmulqdq zmm6,zmm11,zmm15,0x00 + vpclmulqdq zmm7,zmm11,zmm15,0x01 + vpclmulqdq zmm8,zmm11,zmm15,0x10 + vmovdqu64 zmm15,ZMMWORD[160+rcx] + vpclmulqdq zmm9,zmm3,zmm15,0x11 + vpclmulqdq zmm10,zmm3,zmm15,0x00 + vpclmulqdq zmm12,zmm3,zmm15,0x01 + vpclmulqdq zmm13,zmm3,zmm15,0x10 + vmovdqu64 zmm15,ZMMWORD[224+rcx] + vpclmulqdq zmm11,zmm4,zmm15,0x11 + vpclmulqdq zmm3,zmm4,zmm15,0x00 + vpternlogq zmm1,zmm11,zmm9,0x96 + vpternlogq zmm6,zmm3,zmm10,0x96 + vpclmulqdq zmm11,zmm4,zmm15,0x01 + vpclmulqdq zmm3,zmm4,zmm15,0x10 + vpternlogq zmm7,zmm11,zmm12,0x96 + vpternlogq zmm8,zmm3,zmm13,0x96 + vmovdqu64 zmm15,ZMMWORD[288+rcx] + vpclmulqdq zmm9,zmm5,zmm15,0x11 + vpclmulqdq zmm10,zmm5,zmm15,0x00 + vpclmulqdq zmm12,zmm5,zmm15,0x01 + vpclmulqdq zmm13,zmm5,zmm15,0x10 + vpxorq zmm9,zmm1,zmm9 + vpxorq zmm10,zmm6,zmm10 + vpxorq zmm12,zmm7,zmm12 + vpxorq zmm13,zmm8,zmm13 + + vpxorq zmm12,zmm12,zmm13 + vpsrldq zmm7,zmm12,8 + vpslldq 
zmm8,zmm12,8 + vpxorq zmm1,zmm9,zmm7 + vpxorq zmm6,zmm10,zmm8 + vextracti64x4 ymm12,zmm1,1 + vpxorq ymm1,ymm1,ymm12 + vextracti32x4 xmm12,ymm1,1 + vpxorq xmm1,xmm1,xmm12 + vextracti64x4 ymm13,zmm6,1 + vpxorq ymm6,ymm6,ymm13 + vextracti32x4 xmm13,ymm6,1 + vpxorq xmm6,xmm6,xmm13 + vmovdqa64 xmm15,XMMWORD[POLY2] + + + vpclmulqdq xmm7,xmm15,xmm6,0x01 + vpslldq xmm7,xmm7,8 + vpxorq xmm7,xmm6,xmm7 + + + vpclmulqdq xmm8,xmm15,xmm7,0x00 + vpsrldq xmm8,xmm8,4 + vpclmulqdq xmm14,xmm15,xmm7,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm8,xmm1,0x96 + + jmp NEAR $L$_CALC_AAD_done_6 +$L$_AAD_blocks_15_6: + sub r12,1536 + kmovq k1,[r12] + vmovdqu8 zmm11,ZMMWORD[r10] + vmovdqu8 zmm3,ZMMWORD[64+r10] + vmovdqu8 zmm4,ZMMWORD[128+r10] + vmovdqu8 zmm5{k1}{z},[192+r10] + vpshufb zmm11,zmm11,zmm16 + vpshufb zmm3,zmm3,zmm16 + vpshufb zmm4,zmm4,zmm16 + vpshufb zmm5,zmm5,zmm16 + vpxorq zmm11,zmm11,zmm14 + vmovdqu64 zmm15,ZMMWORD[112+rcx] + vpclmulqdq zmm1,zmm11,zmm15,0x11 + vpclmulqdq zmm6,zmm11,zmm15,0x00 + vpclmulqdq zmm7,zmm11,zmm15,0x01 + vpclmulqdq zmm8,zmm11,zmm15,0x10 + vmovdqu64 zmm15,ZMMWORD[176+rcx] + vpclmulqdq zmm9,zmm3,zmm15,0x11 + vpclmulqdq zmm10,zmm3,zmm15,0x00 + vpclmulqdq zmm12,zmm3,zmm15,0x01 + vpclmulqdq zmm13,zmm3,zmm15,0x10 + vmovdqu64 zmm15,ZMMWORD[240+rcx] + vpclmulqdq zmm11,zmm4,zmm15,0x11 + vpclmulqdq zmm3,zmm4,zmm15,0x00 + vpternlogq zmm9,zmm11,zmm1,0x96 + vpternlogq zmm10,zmm3,zmm6,0x96 + vpclmulqdq zmm11,zmm4,zmm15,0x01 + vpclmulqdq zmm3,zmm4,zmm15,0x10 + vpternlogq zmm12,zmm11,zmm7,0x96 + vpternlogq zmm13,zmm3,zmm8,0x96 + vmovdqu64 ymm15,YMMWORD[304+rcx] + vinserti64x2 zmm15,zmm15,ZMMWORD[336+rcx],2 + vpclmulqdq zmm7,zmm5,zmm15,0x01 + vpclmulqdq zmm8,zmm5,zmm15,0x10 + vpclmulqdq zmm1,zmm5,zmm15,0x11 + vpclmulqdq zmm6,zmm5,zmm15,0x00 + + vpxorq zmm7,zmm7,zmm12 + vpxorq zmm8,zmm8,zmm13 + vpxorq zmm1,zmm1,zmm9 + vpxorq zmm6,zmm6,zmm10 + + vpxorq zmm7,zmm7,zmm8 + vpsrldq zmm12,zmm7,8 + vpslldq zmm13,zmm7,8 + vpxorq zmm1,zmm1,zmm12 + vpxorq zmm6,zmm6,zmm13 + vextracti64x4 ymm12,zmm1,1 + vpxorq ymm1,ymm1,ymm12 + vextracti32x4 xmm12,ymm1,1 + vpxorq xmm1,xmm1,xmm12 + vextracti64x4 ymm13,zmm6,1 + vpxorq ymm6,ymm6,ymm13 + vextracti32x4 xmm13,ymm6,1 + vpxorq xmm6,xmm6,xmm13 + vmovdqa64 xmm15,XMMWORD[POLY2] + + + vpclmulqdq xmm7,xmm15,xmm6,0x01 + vpslldq xmm7,xmm7,8 + vpxorq xmm7,xmm6,xmm7 + + + vpclmulqdq xmm8,xmm15,xmm7,0x00 + vpsrldq xmm8,xmm8,4 + vpclmulqdq xmm14,xmm15,xmm7,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm8,xmm1,0x96 + + jmp NEAR $L$_CALC_AAD_done_6 +$L$_AAD_blocks_14_6: + sub r12,1536 + kmovq k1,[r12] + vmovdqu8 zmm11,ZMMWORD[r10] + vmovdqu8 zmm3,ZMMWORD[64+r10] + vmovdqu8 zmm4,ZMMWORD[128+r10] + vmovdqu8 ymm5{k1}{z},[192+r10] + vpshufb zmm11,zmm11,zmm16 + vpshufb zmm3,zmm3,zmm16 + vpshufb zmm4,zmm4,zmm16 + vpshufb ymm5,ymm5,ymm16 + vpxorq zmm11,zmm11,zmm14 + vmovdqu64 zmm15,ZMMWORD[128+rcx] + vpclmulqdq zmm1,zmm11,zmm15,0x11 + vpclmulqdq zmm6,zmm11,zmm15,0x00 + vpclmulqdq zmm7,zmm11,zmm15,0x01 + vpclmulqdq zmm8,zmm11,zmm15,0x10 + vmovdqu64 zmm15,ZMMWORD[192+rcx] + vpclmulqdq zmm9,zmm3,zmm15,0x11 + vpclmulqdq zmm10,zmm3,zmm15,0x00 + vpclmulqdq zmm12,zmm3,zmm15,0x01 + vpclmulqdq zmm13,zmm3,zmm15,0x10 + vmovdqu64 zmm15,ZMMWORD[256+rcx] + vpclmulqdq zmm11,zmm4,zmm15,0x11 + vpclmulqdq zmm3,zmm4,zmm15,0x00 + vpternlogq zmm9,zmm11,zmm1,0x96 + vpternlogq zmm10,zmm3,zmm6,0x96 + vpclmulqdq zmm11,zmm4,zmm15,0x01 + vpclmulqdq zmm3,zmm4,zmm15,0x10 + vpternlogq zmm12,zmm11,zmm7,0x96 + vpternlogq zmm13,zmm3,zmm8,0x96 + vmovdqu64 ymm15,YMMWORD[320+rcx] + vpclmulqdq 
ymm7,ymm5,ymm15,0x01 + vpclmulqdq ymm8,ymm5,ymm15,0x10 + vpclmulqdq ymm1,ymm5,ymm15,0x11 + vpclmulqdq ymm6,ymm5,ymm15,0x00 + + vpxorq zmm7,zmm7,zmm12 + vpxorq zmm8,zmm8,zmm13 + vpxorq zmm1,zmm1,zmm9 + vpxorq zmm6,zmm6,zmm10 + + vpxorq zmm7,zmm7,zmm8 + vpsrldq zmm12,zmm7,8 + vpslldq zmm13,zmm7,8 + vpxorq zmm1,zmm1,zmm12 + vpxorq zmm6,zmm6,zmm13 + vextracti64x4 ymm12,zmm1,1 + vpxorq ymm1,ymm1,ymm12 + vextracti32x4 xmm12,ymm1,1 + vpxorq xmm1,xmm1,xmm12 + vextracti64x4 ymm13,zmm6,1 + vpxorq ymm6,ymm6,ymm13 + vextracti32x4 xmm13,ymm6,1 + vpxorq xmm6,xmm6,xmm13 + vmovdqa64 xmm15,XMMWORD[POLY2] + + + vpclmulqdq xmm7,xmm15,xmm6,0x01 + vpslldq xmm7,xmm7,8 + vpxorq xmm7,xmm6,xmm7 + + + vpclmulqdq xmm8,xmm15,xmm7,0x00 + vpsrldq xmm8,xmm8,4 + vpclmulqdq xmm14,xmm15,xmm7,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm8,xmm1,0x96 + + jmp NEAR $L$_CALC_AAD_done_6 +$L$_AAD_blocks_13_6: + sub r12,1536 + kmovq k1,[r12] + vmovdqu8 zmm11,ZMMWORD[r10] + vmovdqu8 zmm3,ZMMWORD[64+r10] + vmovdqu8 zmm4,ZMMWORD[128+r10] + vmovdqu8 xmm5{k1}{z},[192+r10] + vpshufb zmm11,zmm11,zmm16 + vpshufb zmm3,zmm3,zmm16 + vpshufb zmm4,zmm4,zmm16 + vpshufb xmm5,xmm5,xmm16 + vpxorq zmm11,zmm11,zmm14 + vmovdqu64 zmm15,ZMMWORD[144+rcx] + vpclmulqdq zmm1,zmm11,zmm15,0x11 + vpclmulqdq zmm6,zmm11,zmm15,0x00 + vpclmulqdq zmm7,zmm11,zmm15,0x01 + vpclmulqdq zmm8,zmm11,zmm15,0x10 + vmovdqu64 zmm15,ZMMWORD[208+rcx] + vpclmulqdq zmm9,zmm3,zmm15,0x11 + vpclmulqdq zmm10,zmm3,zmm15,0x00 + vpclmulqdq zmm12,zmm3,zmm15,0x01 + vpclmulqdq zmm13,zmm3,zmm15,0x10 + vmovdqu64 zmm15,ZMMWORD[272+rcx] + vpclmulqdq zmm11,zmm4,zmm15,0x11 + vpclmulqdq zmm3,zmm4,zmm15,0x00 + vpternlogq zmm9,zmm11,zmm1,0x96 + vpternlogq zmm10,zmm3,zmm6,0x96 + vpclmulqdq zmm11,zmm4,zmm15,0x01 + vpclmulqdq zmm3,zmm4,zmm15,0x10 + vpternlogq zmm12,zmm11,zmm7,0x96 + vpternlogq zmm13,zmm3,zmm8,0x96 + vmovdqu64 xmm15,XMMWORD[336+rcx] + vpclmulqdq xmm7,xmm5,xmm15,0x01 + vpclmulqdq xmm8,xmm5,xmm15,0x10 + vpclmulqdq xmm1,xmm5,xmm15,0x11 + vpclmulqdq xmm6,xmm5,xmm15,0x00 + + vpxorq zmm7,zmm7,zmm12 + vpxorq zmm8,zmm8,zmm13 + vpxorq zmm1,zmm1,zmm9 + vpxorq zmm6,zmm6,zmm10 + + vpxorq zmm7,zmm7,zmm8 + vpsrldq zmm12,zmm7,8 + vpslldq zmm13,zmm7,8 + vpxorq zmm1,zmm1,zmm12 + vpxorq zmm6,zmm6,zmm13 + vextracti64x4 ymm12,zmm1,1 + vpxorq ymm1,ymm1,ymm12 + vextracti32x4 xmm12,ymm1,1 + vpxorq xmm1,xmm1,xmm12 + vextracti64x4 ymm13,zmm6,1 + vpxorq ymm6,ymm6,ymm13 + vextracti32x4 xmm13,ymm6,1 + vpxorq xmm6,xmm6,xmm13 + vmovdqa64 xmm15,XMMWORD[POLY2] + + + vpclmulqdq xmm7,xmm15,xmm6,0x01 + vpslldq xmm7,xmm7,8 + vpxorq xmm7,xmm6,xmm7 + + + vpclmulqdq xmm8,xmm15,xmm7,0x00 + vpsrldq xmm8,xmm8,4 + vpclmulqdq xmm14,xmm15,xmm7,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm8,xmm1,0x96 + + jmp NEAR $L$_CALC_AAD_done_6 +$L$_AAD_blocks_12_6: + sub r12,1024 + kmovq k1,[r12] + vmovdqu8 zmm11,ZMMWORD[r10] + vmovdqu8 zmm3,ZMMWORD[64+r10] + vmovdqu8 zmm4{k1}{z},[128+r10] + vpshufb zmm11,zmm11,zmm16 + vpshufb zmm3,zmm3,zmm16 + vpshufb zmm4,zmm4,zmm16 + vpxorq zmm11,zmm11,zmm14 + vmovdqu64 zmm15,ZMMWORD[160+rcx] + vpclmulqdq zmm1,zmm11,zmm15,0x11 + vpclmulqdq zmm6,zmm11,zmm15,0x00 + vpclmulqdq zmm7,zmm11,zmm15,0x01 + vpclmulqdq zmm8,zmm11,zmm15,0x10 + vmovdqu64 zmm15,ZMMWORD[224+rcx] + vpclmulqdq zmm9,zmm3,zmm15,0x11 + vpclmulqdq zmm10,zmm3,zmm15,0x00 + vpclmulqdq zmm12,zmm3,zmm15,0x01 + vpclmulqdq zmm13,zmm3,zmm15,0x10 + vmovdqu64 zmm15,ZMMWORD[288+rcx] + vpclmulqdq zmm11,zmm4,zmm15,0x11 + vpclmulqdq zmm3,zmm4,zmm15,0x00 + vpternlogq zmm9,zmm11,zmm1,0x96 + vpternlogq zmm10,zmm3,zmm6,0x96 + vpclmulqdq 
zmm11,zmm4,zmm15,0x01 + vpclmulqdq zmm3,zmm4,zmm15,0x10 + vpternlogq zmm12,zmm11,zmm7,0x96 + vpternlogq zmm13,zmm3,zmm8,0x96 + + vpxorq zmm12,zmm12,zmm13 + vpsrldq zmm7,zmm12,8 + vpslldq zmm8,zmm12,8 + vpxorq zmm1,zmm9,zmm7 + vpxorq zmm6,zmm10,zmm8 + vextracti64x4 ymm12,zmm1,1 + vpxorq ymm1,ymm1,ymm12 + vextracti32x4 xmm12,ymm1,1 + vpxorq xmm1,xmm1,xmm12 + vextracti64x4 ymm13,zmm6,1 + vpxorq ymm6,ymm6,ymm13 + vextracti32x4 xmm13,ymm6,1 + vpxorq xmm6,xmm6,xmm13 + vmovdqa64 xmm15,XMMWORD[POLY2] + + + vpclmulqdq xmm7,xmm15,xmm6,0x01 + vpslldq xmm7,xmm7,8 + vpxorq xmm7,xmm6,xmm7 + + + vpclmulqdq xmm8,xmm15,xmm7,0x00 + vpsrldq xmm8,xmm8,4 + vpclmulqdq xmm14,xmm15,xmm7,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm8,xmm1,0x96 + + jmp NEAR $L$_CALC_AAD_done_6 +$L$_AAD_blocks_11_6: + sub r12,1024 + kmovq k1,[r12] + vmovdqu8 zmm11,ZMMWORD[r10] + vmovdqu8 zmm3,ZMMWORD[64+r10] + vmovdqu8 zmm4{k1}{z},[128+r10] + vpshufb zmm11,zmm11,zmm16 + vpshufb zmm3,zmm3,zmm16 + vpshufb zmm4,zmm4,zmm16 + vpxorq zmm11,zmm11,zmm14 + vmovdqu64 zmm15,ZMMWORD[176+rcx] + vpclmulqdq zmm1,zmm11,zmm15,0x11 + vpclmulqdq zmm6,zmm11,zmm15,0x00 + vpclmulqdq zmm7,zmm11,zmm15,0x01 + vpclmulqdq zmm8,zmm11,zmm15,0x10 + vmovdqu64 zmm15,ZMMWORD[240+rcx] + vpclmulqdq zmm9,zmm3,zmm15,0x11 + vpclmulqdq zmm10,zmm3,zmm15,0x00 + vpclmulqdq zmm12,zmm3,zmm15,0x01 + vpclmulqdq zmm13,zmm3,zmm15,0x10 + vpxorq zmm9,zmm1,zmm9 + vpxorq zmm10,zmm6,zmm10 + vpxorq zmm12,zmm7,zmm12 + vpxorq zmm13,zmm8,zmm13 + vmovdqu64 ymm15,YMMWORD[304+rcx] + vinserti64x2 zmm15,zmm15,ZMMWORD[336+rcx],2 + vpclmulqdq zmm7,zmm4,zmm15,0x01 + vpclmulqdq zmm8,zmm4,zmm15,0x10 + vpclmulqdq zmm1,zmm4,zmm15,0x11 + vpclmulqdq zmm6,zmm4,zmm15,0x00 + + vpxorq zmm7,zmm7,zmm12 + vpxorq zmm8,zmm8,zmm13 + vpxorq zmm1,zmm1,zmm9 + vpxorq zmm6,zmm6,zmm10 + + vpxorq zmm7,zmm7,zmm8 + vpsrldq zmm12,zmm7,8 + vpslldq zmm13,zmm7,8 + vpxorq zmm1,zmm1,zmm12 + vpxorq zmm6,zmm6,zmm13 + vextracti64x4 ymm12,zmm1,1 + vpxorq ymm1,ymm1,ymm12 + vextracti32x4 xmm12,ymm1,1 + vpxorq xmm1,xmm1,xmm12 + vextracti64x4 ymm13,zmm6,1 + vpxorq ymm6,ymm6,ymm13 + vextracti32x4 xmm13,ymm6,1 + vpxorq xmm6,xmm6,xmm13 + vmovdqa64 xmm15,XMMWORD[POLY2] + + + vpclmulqdq xmm7,xmm15,xmm6,0x01 + vpslldq xmm7,xmm7,8 + vpxorq xmm7,xmm6,xmm7 + + + vpclmulqdq xmm8,xmm15,xmm7,0x00 + vpsrldq xmm8,xmm8,4 + vpclmulqdq xmm14,xmm15,xmm7,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm8,xmm1,0x96 + + jmp NEAR $L$_CALC_AAD_done_6 +$L$_AAD_blocks_10_6: + sub r12,1024 + kmovq k1,[r12] + vmovdqu8 zmm11,ZMMWORD[r10] + vmovdqu8 zmm3,ZMMWORD[64+r10] + vmovdqu8 ymm4{k1}{z},[128+r10] + vpshufb zmm11,zmm11,zmm16 + vpshufb zmm3,zmm3,zmm16 + vpshufb ymm4,ymm4,ymm16 + vpxorq zmm11,zmm11,zmm14 + vmovdqu64 zmm15,ZMMWORD[192+rcx] + vpclmulqdq zmm1,zmm11,zmm15,0x11 + vpclmulqdq zmm6,zmm11,zmm15,0x00 + vpclmulqdq zmm7,zmm11,zmm15,0x01 + vpclmulqdq zmm8,zmm11,zmm15,0x10 + vmovdqu64 zmm15,ZMMWORD[256+rcx] + vpclmulqdq zmm9,zmm3,zmm15,0x11 + vpclmulqdq zmm10,zmm3,zmm15,0x00 + vpclmulqdq zmm12,zmm3,zmm15,0x01 + vpclmulqdq zmm13,zmm3,zmm15,0x10 + vpxorq zmm9,zmm1,zmm9 + vpxorq zmm10,zmm6,zmm10 + vpxorq zmm12,zmm7,zmm12 + vpxorq zmm13,zmm8,zmm13 + vmovdqu64 ymm15,YMMWORD[320+rcx] + vpclmulqdq ymm7,ymm4,ymm15,0x01 + vpclmulqdq ymm8,ymm4,ymm15,0x10 + vpclmulqdq ymm1,ymm4,ymm15,0x11 + vpclmulqdq ymm6,ymm4,ymm15,0x00 + + vpxorq zmm7,zmm7,zmm12 + vpxorq zmm8,zmm8,zmm13 + vpxorq zmm1,zmm1,zmm9 + vpxorq zmm6,zmm6,zmm10 + + vpxorq zmm7,zmm7,zmm8 + vpsrldq zmm12,zmm7,8 + vpslldq zmm13,zmm7,8 + vpxorq zmm1,zmm1,zmm12 + vpxorq zmm6,zmm6,zmm13 + vextracti64x4 
ymm12,zmm1,1 + vpxorq ymm1,ymm1,ymm12 + vextracti32x4 xmm12,ymm1,1 + vpxorq xmm1,xmm1,xmm12 + vextracti64x4 ymm13,zmm6,1 + vpxorq ymm6,ymm6,ymm13 + vextracti32x4 xmm13,ymm6,1 + vpxorq xmm6,xmm6,xmm13 + vmovdqa64 xmm15,XMMWORD[POLY2] + + + vpclmulqdq xmm7,xmm15,xmm6,0x01 + vpslldq xmm7,xmm7,8 + vpxorq xmm7,xmm6,xmm7 + + + vpclmulqdq xmm8,xmm15,xmm7,0x00 + vpsrldq xmm8,xmm8,4 + vpclmulqdq xmm14,xmm15,xmm7,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm8,xmm1,0x96 + + jmp NEAR $L$_CALC_AAD_done_6 +$L$_AAD_blocks_9_6: + sub r12,1024 + kmovq k1,[r12] + vmovdqu8 zmm11,ZMMWORD[r10] + vmovdqu8 zmm3,ZMMWORD[64+r10] + vmovdqu8 xmm4{k1}{z},[128+r10] + vpshufb zmm11,zmm11,zmm16 + vpshufb zmm3,zmm3,zmm16 + vpshufb xmm4,xmm4,xmm16 + vpxorq zmm11,zmm11,zmm14 + vmovdqu64 zmm15,ZMMWORD[208+rcx] + vpclmulqdq zmm1,zmm11,zmm15,0x11 + vpclmulqdq zmm6,zmm11,zmm15,0x00 + vpclmulqdq zmm7,zmm11,zmm15,0x01 + vpclmulqdq zmm8,zmm11,zmm15,0x10 + vmovdqu64 zmm15,ZMMWORD[272+rcx] + vpclmulqdq zmm9,zmm3,zmm15,0x11 + vpclmulqdq zmm10,zmm3,zmm15,0x00 + vpclmulqdq zmm12,zmm3,zmm15,0x01 + vpclmulqdq zmm13,zmm3,zmm15,0x10 + vpxorq zmm9,zmm1,zmm9 + vpxorq zmm10,zmm6,zmm10 + vpxorq zmm12,zmm7,zmm12 + vpxorq zmm13,zmm8,zmm13 + vmovdqu64 xmm15,XMMWORD[336+rcx] + vpclmulqdq xmm7,xmm4,xmm15,0x01 + vpclmulqdq xmm8,xmm4,xmm15,0x10 + vpclmulqdq xmm1,xmm4,xmm15,0x11 + vpclmulqdq xmm6,xmm4,xmm15,0x00 + + vpxorq zmm7,zmm7,zmm12 + vpxorq zmm8,zmm8,zmm13 + vpxorq zmm1,zmm1,zmm9 + vpxorq zmm6,zmm6,zmm10 + + vpxorq zmm7,zmm7,zmm8 + vpsrldq zmm12,zmm7,8 + vpslldq zmm13,zmm7,8 + vpxorq zmm1,zmm1,zmm12 + vpxorq zmm6,zmm6,zmm13 + vextracti64x4 ymm12,zmm1,1 + vpxorq ymm1,ymm1,ymm12 + vextracti32x4 xmm12,ymm1,1 + vpxorq xmm1,xmm1,xmm12 + vextracti64x4 ymm13,zmm6,1 + vpxorq ymm6,ymm6,ymm13 + vextracti32x4 xmm13,ymm6,1 + vpxorq xmm6,xmm6,xmm13 + vmovdqa64 xmm15,XMMWORD[POLY2] + + + vpclmulqdq xmm7,xmm15,xmm6,0x01 + vpslldq xmm7,xmm7,8 + vpxorq xmm7,xmm6,xmm7 + + + vpclmulqdq xmm8,xmm15,xmm7,0x00 + vpsrldq xmm8,xmm8,4 + vpclmulqdq xmm14,xmm15,xmm7,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm8,xmm1,0x96 + + jmp NEAR $L$_CALC_AAD_done_6 +$L$_AAD_blocks_8_6: + sub r12,512 + kmovq k1,[r12] + vmovdqu8 zmm11,ZMMWORD[r10] + vmovdqu8 zmm3{k1}{z},[64+r10] + vpshufb zmm11,zmm11,zmm16 + vpshufb zmm3,zmm3,zmm16 + vpxorq zmm11,zmm11,zmm14 + vmovdqu64 zmm15,ZMMWORD[224+rcx] + vpclmulqdq zmm1,zmm11,zmm15,0x11 + vpclmulqdq zmm6,zmm11,zmm15,0x00 + vpclmulqdq zmm7,zmm11,zmm15,0x01 + vpclmulqdq zmm8,zmm11,zmm15,0x10 + vmovdqu64 zmm15,ZMMWORD[288+rcx] + vpclmulqdq zmm9,zmm3,zmm15,0x11 + vpclmulqdq zmm10,zmm3,zmm15,0x00 + vpclmulqdq zmm12,zmm3,zmm15,0x01 + vpclmulqdq zmm13,zmm3,zmm15,0x10 + vpxorq zmm9,zmm1,zmm9 + vpxorq zmm10,zmm6,zmm10 + vpxorq zmm12,zmm7,zmm12 + vpxorq zmm13,zmm8,zmm13 + + vpxorq zmm12,zmm12,zmm13 + vpsrldq zmm7,zmm12,8 + vpslldq zmm8,zmm12,8 + vpxorq zmm1,zmm9,zmm7 + vpxorq zmm6,zmm10,zmm8 + vextracti64x4 ymm12,zmm1,1 + vpxorq ymm1,ymm1,ymm12 + vextracti32x4 xmm12,ymm1,1 + vpxorq xmm1,xmm1,xmm12 + vextracti64x4 ymm13,zmm6,1 + vpxorq ymm6,ymm6,ymm13 + vextracti32x4 xmm13,ymm6,1 + vpxorq xmm6,xmm6,xmm13 + vmovdqa64 xmm15,XMMWORD[POLY2] + + + vpclmulqdq xmm7,xmm15,xmm6,0x01 + vpslldq xmm7,xmm7,8 + vpxorq xmm7,xmm6,xmm7 + + + vpclmulqdq xmm8,xmm15,xmm7,0x00 + vpsrldq xmm8,xmm8,4 + vpclmulqdq xmm14,xmm15,xmm7,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm8,xmm1,0x96 + + jmp NEAR $L$_CALC_AAD_done_6 +$L$_AAD_blocks_7_6: + sub r12,512 + kmovq k1,[r12] + vmovdqu8 zmm11,ZMMWORD[r10] + vmovdqu8 zmm3{k1}{z},[64+r10] + vpshufb zmm11,zmm11,zmm16 + 
vpshufb zmm3,zmm3,zmm16 + vpxorq zmm11,zmm11,zmm14 + vmovdqu64 zmm15,ZMMWORD[240+rcx] + vpclmulqdq zmm9,zmm11,zmm15,0x11 + vpclmulqdq zmm10,zmm11,zmm15,0x00 + vpclmulqdq zmm12,zmm11,zmm15,0x01 + vpclmulqdq zmm13,zmm11,zmm15,0x10 + vmovdqu64 ymm15,YMMWORD[304+rcx] + vinserti64x2 zmm15,zmm15,ZMMWORD[336+rcx],2 + vpclmulqdq zmm7,zmm3,zmm15,0x01 + vpclmulqdq zmm8,zmm3,zmm15,0x10 + vpclmulqdq zmm1,zmm3,zmm15,0x11 + vpclmulqdq zmm6,zmm3,zmm15,0x00 + + vpxorq zmm7,zmm7,zmm12 + vpxorq zmm8,zmm8,zmm13 + vpxorq zmm1,zmm1,zmm9 + vpxorq zmm6,zmm6,zmm10 + + vpxorq zmm7,zmm7,zmm8 + vpsrldq zmm12,zmm7,8 + vpslldq zmm13,zmm7,8 + vpxorq zmm1,zmm1,zmm12 + vpxorq zmm6,zmm6,zmm13 + vextracti64x4 ymm12,zmm1,1 + vpxorq ymm1,ymm1,ymm12 + vextracti32x4 xmm12,ymm1,1 + vpxorq xmm1,xmm1,xmm12 + vextracti64x4 ymm13,zmm6,1 + vpxorq ymm6,ymm6,ymm13 + vextracti32x4 xmm13,ymm6,1 + vpxorq xmm6,xmm6,xmm13 + vmovdqa64 xmm15,XMMWORD[POLY2] + + + vpclmulqdq xmm7,xmm15,xmm6,0x01 + vpslldq xmm7,xmm7,8 + vpxorq xmm7,xmm6,xmm7 + + + vpclmulqdq xmm8,xmm15,xmm7,0x00 + vpsrldq xmm8,xmm8,4 + vpclmulqdq xmm14,xmm15,xmm7,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm8,xmm1,0x96 + + jmp NEAR $L$_CALC_AAD_done_6 +$L$_AAD_blocks_6_6: + sub r12,512 + kmovq k1,[r12] + vmovdqu8 zmm11,ZMMWORD[r10] + vmovdqu8 ymm3{k1}{z},[64+r10] + vpshufb zmm11,zmm11,zmm16 + vpshufb ymm3,ymm3,ymm16 + vpxorq zmm11,zmm11,zmm14 + vmovdqu64 zmm15,ZMMWORD[256+rcx] + vpclmulqdq zmm9,zmm11,zmm15,0x11 + vpclmulqdq zmm10,zmm11,zmm15,0x00 + vpclmulqdq zmm12,zmm11,zmm15,0x01 + vpclmulqdq zmm13,zmm11,zmm15,0x10 + vmovdqu64 ymm15,YMMWORD[320+rcx] + vpclmulqdq ymm7,ymm3,ymm15,0x01 + vpclmulqdq ymm8,ymm3,ymm15,0x10 + vpclmulqdq ymm1,ymm3,ymm15,0x11 + vpclmulqdq ymm6,ymm3,ymm15,0x00 + + vpxorq zmm7,zmm7,zmm12 + vpxorq zmm8,zmm8,zmm13 + vpxorq zmm1,zmm1,zmm9 + vpxorq zmm6,zmm6,zmm10 + + vpxorq zmm7,zmm7,zmm8 + vpsrldq zmm12,zmm7,8 + vpslldq zmm13,zmm7,8 + vpxorq zmm1,zmm1,zmm12 + vpxorq zmm6,zmm6,zmm13 + vextracti64x4 ymm12,zmm1,1 + vpxorq ymm1,ymm1,ymm12 + vextracti32x4 xmm12,ymm1,1 + vpxorq xmm1,xmm1,xmm12 + vextracti64x4 ymm13,zmm6,1 + vpxorq ymm6,ymm6,ymm13 + vextracti32x4 xmm13,ymm6,1 + vpxorq xmm6,xmm6,xmm13 + vmovdqa64 xmm15,XMMWORD[POLY2] + + + vpclmulqdq xmm7,xmm15,xmm6,0x01 + vpslldq xmm7,xmm7,8 + vpxorq xmm7,xmm6,xmm7 + + + vpclmulqdq xmm8,xmm15,xmm7,0x00 + vpsrldq xmm8,xmm8,4 + vpclmulqdq xmm14,xmm15,xmm7,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm8,xmm1,0x96 + + jmp NEAR $L$_CALC_AAD_done_6 +$L$_AAD_blocks_5_6: + sub r12,512 + kmovq k1,[r12] + vmovdqu8 zmm11,ZMMWORD[r10] + vmovdqu8 xmm3{k1}{z},[64+r10] + vpshufb zmm11,zmm11,zmm16 + vpshufb xmm3,xmm3,xmm16 + vpxorq zmm11,zmm11,zmm14 + vmovdqu64 zmm15,ZMMWORD[272+rcx] + vpclmulqdq zmm9,zmm11,zmm15,0x11 + vpclmulqdq zmm10,zmm11,zmm15,0x00 + vpclmulqdq zmm12,zmm11,zmm15,0x01 + vpclmulqdq zmm13,zmm11,zmm15,0x10 + vmovdqu64 xmm15,XMMWORD[336+rcx] + vpclmulqdq xmm7,xmm3,xmm15,0x01 + vpclmulqdq xmm8,xmm3,xmm15,0x10 + vpclmulqdq xmm1,xmm3,xmm15,0x11 + vpclmulqdq xmm6,xmm3,xmm15,0x00 + + vpxorq zmm7,zmm7,zmm12 + vpxorq zmm8,zmm8,zmm13 + vpxorq zmm1,zmm1,zmm9 + vpxorq zmm6,zmm6,zmm10 + + vpxorq zmm7,zmm7,zmm8 + vpsrldq zmm12,zmm7,8 + vpslldq zmm13,zmm7,8 + vpxorq zmm1,zmm1,zmm12 + vpxorq zmm6,zmm6,zmm13 + vextracti64x4 ymm12,zmm1,1 + vpxorq ymm1,ymm1,ymm12 + vextracti32x4 xmm12,ymm1,1 + vpxorq xmm1,xmm1,xmm12 + vextracti64x4 ymm13,zmm6,1 + vpxorq ymm6,ymm6,ymm13 + vextracti32x4 xmm13,ymm6,1 + vpxorq xmm6,xmm6,xmm13 + vmovdqa64 xmm15,XMMWORD[POLY2] + + + vpclmulqdq xmm7,xmm15,xmm6,0x01 + vpslldq xmm7,xmm7,8 + 
vpxorq xmm7,xmm6,xmm7 + + + vpclmulqdq xmm8,xmm15,xmm7,0x00 + vpsrldq xmm8,xmm8,4 + vpclmulqdq xmm14,xmm15,xmm7,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm8,xmm1,0x96 + + jmp NEAR $L$_CALC_AAD_done_6 +$L$_AAD_blocks_4_6: + kmovq k1,[r12] + vmovdqu8 zmm11{k1}{z},[r10] + vpshufb zmm11,zmm11,zmm16 + vpxorq zmm11,zmm11,zmm14 + vmovdqu64 zmm15,ZMMWORD[288+rcx] + vpclmulqdq zmm9,zmm11,zmm15,0x11 + vpclmulqdq zmm10,zmm11,zmm15,0x00 + vpclmulqdq zmm12,zmm11,zmm15,0x01 + vpclmulqdq zmm13,zmm11,zmm15,0x10 + + vpxorq zmm12,zmm12,zmm13 + vpsrldq zmm7,zmm12,8 + vpslldq zmm8,zmm12,8 + vpxorq zmm1,zmm9,zmm7 + vpxorq zmm6,zmm10,zmm8 + vextracti64x4 ymm12,zmm1,1 + vpxorq ymm1,ymm1,ymm12 + vextracti32x4 xmm12,ymm1,1 + vpxorq xmm1,xmm1,xmm12 + vextracti64x4 ymm13,zmm6,1 + vpxorq ymm6,ymm6,ymm13 + vextracti32x4 xmm13,ymm6,1 + vpxorq xmm6,xmm6,xmm13 + vmovdqa64 xmm15,XMMWORD[POLY2] + + + vpclmulqdq xmm7,xmm15,xmm6,0x01 + vpslldq xmm7,xmm7,8 + vpxorq xmm7,xmm6,xmm7 + + + vpclmulqdq xmm8,xmm15,xmm7,0x00 + vpsrldq xmm8,xmm8,4 + vpclmulqdq xmm14,xmm15,xmm7,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm8,xmm1,0x96 + + jmp NEAR $L$_CALC_AAD_done_6 +$L$_AAD_blocks_3_6: + kmovq k1,[r12] + vmovdqu8 zmm11{k1}{z},[r10] + vpshufb zmm11,zmm11,zmm16 + vpxorq zmm11,zmm11,zmm14 + vmovdqu64 ymm15,YMMWORD[304+rcx] + vinserti64x2 zmm15,zmm15,ZMMWORD[336+rcx],2 + vpclmulqdq zmm7,zmm11,zmm15,0x01 + vpclmulqdq zmm8,zmm11,zmm15,0x10 + vpclmulqdq zmm1,zmm11,zmm15,0x11 + vpclmulqdq zmm6,zmm11,zmm15,0x00 + + vpxorq zmm7,zmm7,zmm8 + vpsrldq zmm12,zmm7,8 + vpslldq zmm13,zmm7,8 + vpxorq zmm1,zmm1,zmm12 + vpxorq zmm6,zmm6,zmm13 + vextracti64x4 ymm12,zmm1,1 + vpxorq ymm1,ymm1,ymm12 + vextracti32x4 xmm12,ymm1,1 + vpxorq xmm1,xmm1,xmm12 + vextracti64x4 ymm13,zmm6,1 + vpxorq ymm6,ymm6,ymm13 + vextracti32x4 xmm13,ymm6,1 + vpxorq xmm6,xmm6,xmm13 + vmovdqa64 xmm15,XMMWORD[POLY2] + + + vpclmulqdq xmm7,xmm15,xmm6,0x01 + vpslldq xmm7,xmm7,8 + vpxorq xmm7,xmm6,xmm7 + + + vpclmulqdq xmm8,xmm15,xmm7,0x00 + vpsrldq xmm8,xmm8,4 + vpclmulqdq xmm14,xmm15,xmm7,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm8,xmm1,0x96 + + jmp NEAR $L$_CALC_AAD_done_6 +$L$_AAD_blocks_2_6: + kmovq k1,[r12] + vmovdqu8 ymm11{k1}{z},[r10] + vpshufb ymm11,ymm11,ymm16 + vpxorq zmm11,zmm11,zmm14 + vmovdqu64 ymm15,YMMWORD[320+rcx] + vpclmulqdq ymm7,ymm11,ymm15,0x01 + vpclmulqdq ymm8,ymm11,ymm15,0x10 + vpclmulqdq ymm1,ymm11,ymm15,0x11 + vpclmulqdq ymm6,ymm11,ymm15,0x00 + + vpxorq zmm7,zmm7,zmm8 + vpsrldq zmm12,zmm7,8 + vpslldq zmm13,zmm7,8 + vpxorq zmm1,zmm1,zmm12 + vpxorq zmm6,zmm6,zmm13 + vextracti64x4 ymm12,zmm1,1 + vpxorq ymm1,ymm1,ymm12 + vextracti32x4 xmm12,ymm1,1 + vpxorq xmm1,xmm1,xmm12 + vextracti64x4 ymm13,zmm6,1 + vpxorq ymm6,ymm6,ymm13 + vextracti32x4 xmm13,ymm6,1 + vpxorq xmm6,xmm6,xmm13 + vmovdqa64 xmm15,XMMWORD[POLY2] + + + vpclmulqdq xmm7,xmm15,xmm6,0x01 + vpslldq xmm7,xmm7,8 + vpxorq xmm7,xmm6,xmm7 + + + vpclmulqdq xmm8,xmm15,xmm7,0x00 + vpsrldq xmm8,xmm8,4 + vpclmulqdq xmm14,xmm15,xmm7,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm8,xmm1,0x96 + + jmp NEAR $L$_CALC_AAD_done_6 +$L$_AAD_blocks_1_6: + kmovq k1,[r12] + vmovdqu8 xmm11{k1}{z},[r10] + vpshufb xmm11,xmm11,xmm16 + vpxorq zmm11,zmm11,zmm14 + vmovdqu64 xmm15,XMMWORD[336+rcx] + vpclmulqdq xmm7,xmm11,xmm15,0x01 + vpclmulqdq xmm8,xmm11,xmm15,0x10 + vpclmulqdq xmm1,xmm11,xmm15,0x11 + vpclmulqdq xmm6,xmm11,xmm15,0x00 + + vpxorq zmm7,zmm7,zmm8 + vpsrldq zmm12,zmm7,8 + vpslldq zmm13,zmm7,8 + vpxorq zmm1,zmm1,zmm12 + vpxorq zmm6,zmm6,zmm13 + vextracti64x4 ymm12,zmm1,1 + vpxorq ymm1,ymm1,ymm12 + 
vextracti32x4 xmm12,ymm1,1 + vpxorq xmm1,xmm1,xmm12 + vextracti64x4 ymm13,zmm6,1 + vpxorq ymm6,ymm6,ymm13 + vextracti32x4 xmm13,ymm6,1 + vpxorq xmm6,xmm6,xmm13 + vmovdqa64 xmm15,XMMWORD[POLY2] + + + vpclmulqdq xmm7,xmm15,xmm6,0x01 + vpslldq xmm7,xmm7,8 + vpxorq xmm7,xmm6,xmm7 + + + vpclmulqdq xmm8,xmm15,xmm7,0x00 + vpsrldq xmm8,xmm8,4 + vpclmulqdq xmm14,xmm15,xmm7,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm8,xmm1,0x96 + +$L$_CALC_AAD_done_6: + vmovdqu64 XMMWORD[64+rcx],xmm14 + cmp r8,256 + jbe NEAR $L$skip_hkeys_cleanup_9 + vpxor xmm0,xmm0,xmm0 + vmovdqa64 ZMMWORD[rsp],zmm0 + vmovdqa64 ZMMWORD[64+rsp],zmm0 + vmovdqa64 ZMMWORD[128+rsp],zmm0 + vmovdqa64 ZMMWORD[192+rsp],zmm0 + vmovdqa64 ZMMWORD[256+rsp],zmm0 + vmovdqa64 ZMMWORD[320+rsp],zmm0 + vmovdqa64 ZMMWORD[384+rsp],zmm0 + vmovdqa64 ZMMWORD[448+rsp],zmm0 + vmovdqa64 ZMMWORD[512+rsp],zmm0 + vmovdqa64 ZMMWORD[576+rsp],zmm0 + vmovdqa64 ZMMWORD[640+rsp],zmm0 + vmovdqa64 ZMMWORD[704+rsp],zmm0 +$L$skip_hkeys_cleanup_9: + vzeroupper + vmovdqu xmm15,XMMWORD[((-16))+rbp] + vmovdqu xmm14,XMMWORD[((-32))+rbp] + vmovdqu xmm13,XMMWORD[((-48))+rbp] + vmovdqu xmm12,XMMWORD[((-64))+rbp] + vmovdqu xmm11,XMMWORD[((-80))+rbp] + vmovdqu xmm10,XMMWORD[((-96))+rbp] + vmovdqu xmm9,XMMWORD[((-112))+rbp] + vmovdqu xmm8,XMMWORD[((-128))+rbp] + vmovdqu xmm7,XMMWORD[((-144))+rbp] + vmovdqu xmm6,XMMWORD[((-160))+rbp] + lea rsp,[8+rbp] + pop rsi + + pop rdi + + pop r15 + + pop r14 + + pop r13 + + pop r12 + + pop rbp + + pop rbx + +$L$exit_update_aad: + DB 0F3h,0C3h ;repret +$L$ghash_seh_end: + + +global ossl_aes_gcm_encrypt_avx512 + +ALIGN 32 +ossl_aes_gcm_encrypt_avx512: + +$L$encrypt_seh_begin: +DB 243,15,30,250 + push rbx + +$L$encrypt_seh_push_rbx: + push rbp + +$L$encrypt_seh_push_rbp: + push r12 + +$L$encrypt_seh_push_r12: + push r13 + +$L$encrypt_seh_push_r13: + push r14 + +$L$encrypt_seh_push_r14: + push r15 + +$L$encrypt_seh_push_r15: + push rdi +$L$encrypt_seh_push_rdi: + push rsi +$L$encrypt_seh_push_rsi: + + sub rsp,168 +$L$encrypt_seh_allocstack_xmm: + + + + + + + + + + + lea rbp,[160+rsp] + +$L$encrypt_seh_setfp: + vmovdqu XMMWORD[rsp],xmm6 +$L$encrypt_seh_save_xmm6: + vmovdqu XMMWORD[16+rsp],xmm7 +$L$encrypt_seh_save_xmm7: + vmovdqu XMMWORD[32+rsp],xmm8 +$L$encrypt_seh_save_xmm8: + vmovdqu XMMWORD[48+rsp],xmm9 +$L$encrypt_seh_save_xmm9: + vmovdqu XMMWORD[64+rsp],xmm10 +$L$encrypt_seh_save_xmm10: + vmovdqu XMMWORD[80+rsp],xmm11 +$L$encrypt_seh_save_xmm11: + vmovdqu XMMWORD[96+rsp],xmm12 +$L$encrypt_seh_save_xmm12: + vmovdqu XMMWORD[112+rsp],xmm13 +$L$encrypt_seh_save_xmm13: + vmovdqu XMMWORD[128+rsp],xmm14 +$L$encrypt_seh_save_xmm14: + vmovdqu XMMWORD[144+rsp],xmm15 +$L$encrypt_seh_save_xmm15: + +$L$encrypt_seh_prolog_end: + sub rsp,1584 + and rsp,(-64) + + + mov eax,DWORD[240+rcx] + cmp eax,9 + je NEAR $L$aes_gcm_encrypt_128_avx512 + cmp eax,11 + je NEAR $L$aes_gcm_encrypt_192_avx512 + cmp eax,13 + je NEAR $L$aes_gcm_encrypt_256_avx512 + xor eax,eax + jmp NEAR $L$exit_gcm_encrypt +ALIGN 32 +$L$aes_gcm_encrypt_128_avx512: + cmp QWORD[112+rbp],0 + je NEAR $L$_enc_dec_done_10 + xor r14,r14 + vmovdqu64 xmm14,XMMWORD[64+rdx] + + mov r11,QWORD[r8] + or r11,r11 + je NEAR $L$_partial_block_done_11 + mov r10d,16 + lea r12,[byte_len_to_mask_table] + cmp QWORD[112+rbp],r10 + cmovc r10,QWORD[112+rbp] + add r12,r10 + add r12,r10 + kmovw k1,[r12] + vmovdqu8 xmm0{k1}{z},[r9] + + vmovdqu64 xmm3,XMMWORD[16+rdx] + vmovdqu64 xmm4,XMMWORD[336+rdx] + + + + lea r12,[SHIFT_MASK] + add r12,r11 + vmovdqu64 xmm5,XMMWORD[r12] + vpshufb xmm3,xmm3,xmm5 + vpxorq 
xmm3,xmm3,xmm0 + + + mov r13,QWORD[112+rbp] + add r13,r11 + sub r13,16 + jge NEAR $L$_no_extra_mask_11 + sub r12,r13 +$L$_no_extra_mask_11: + + + + vmovdqu64 xmm0,XMMWORD[16+r12] + vpand xmm3,xmm3,xmm0 + vpshufb xmm3,xmm3,XMMWORD[SHUF_MASK] + vpshufb xmm3,xmm3,xmm5 + vpxorq xmm14,xmm14,xmm3 + cmp r13,0 + jl NEAR $L$_partial_incomplete_11 + + vpclmulqdq xmm7,xmm14,xmm4,0x11 + vpclmulqdq xmm10,xmm14,xmm4,0x00 + vpclmulqdq xmm11,xmm14,xmm4,0x01 + vpclmulqdq xmm14,xmm14,xmm4,0x10 + vpxorq xmm14,xmm14,xmm11 + + vpsrldq xmm11,xmm14,8 + vpslldq xmm14,xmm14,8 + vpxorq xmm7,xmm7,xmm11 + vpxorq xmm14,xmm14,xmm10 + + + + vmovdqu64 xmm11,XMMWORD[POLY2] + + vpclmulqdq xmm10,xmm11,xmm14,0x01 + vpslldq xmm10,xmm10,8 + vpxorq xmm14,xmm14,xmm10 + + + + vpclmulqdq xmm10,xmm11,xmm14,0x00 + vpsrldq xmm10,xmm10,4 + vpclmulqdq xmm14,xmm11,xmm14,0x10 + vpslldq xmm14,xmm14,4 + + vpternlogq xmm14,xmm7,xmm10,0x96 + + mov QWORD[r8],0 + + mov r12,r11 + mov r11,16 + sub r11,r12 + jmp NEAR $L$_enc_dec_done_11 + +$L$_partial_incomplete_11: + mov r12,QWORD[112+rbp] + add QWORD[r8],r12 + mov r11,QWORD[112+rbp] + +$L$_enc_dec_done_11: + + + lea r12,[byte_len_to_mask_table] + kmovw k1,[r11*2+r12] + vmovdqu64 XMMWORD[64+rdx],xmm14 + + vpshufb xmm3,xmm3,XMMWORD[SHUF_MASK] + vpshufb xmm3,xmm3,xmm5 + mov r12,QWORD[120+rbp] + vmovdqu8 XMMWORD[r12]{k1},xmm3 +$L$_partial_block_done_11: + vmovdqu64 xmm2,XMMWORD[rdx] + mov r13,QWORD[112+rbp] + sub r13,r11 + je NEAR $L$_enc_dec_done_10 + cmp r13,256 + jbe NEAR $L$_message_below_equal_16_blocks_10 + + vmovdqa64 zmm29,ZMMWORD[SHUF_MASK] + vmovdqa64 zmm27,ZMMWORD[ddq_addbe_4444] + vmovdqa64 zmm28,ZMMWORD[ddq_addbe_1234] + + + + + + + vmovd r15d,xmm2 + and r15d,255 + + vshufi64x2 zmm2,zmm2,zmm2,0 + vpshufb zmm2,zmm2,zmm29 + + + + cmp r15b,240 + jae NEAR $L$_next_16_overflow_12 + vpaddd zmm7,zmm2,zmm28 + vpaddd zmm10,zmm7,zmm27 + vpaddd zmm11,zmm10,zmm27 + vpaddd zmm12,zmm11,zmm27 + jmp NEAR $L$_next_16_ok_12 +$L$_next_16_overflow_12: + vpshufb zmm2,zmm2,zmm29 + vmovdqa64 zmm12,ZMMWORD[ddq_add_4444] + vpaddd zmm7,zmm2,ZMMWORD[ddq_add_1234] + vpaddd zmm10,zmm7,zmm12 + vpaddd zmm11,zmm10,zmm12 + vpaddd zmm12,zmm11,zmm12 + vpshufb zmm7,zmm7,zmm29 + vpshufb zmm10,zmm10,zmm29 + vpshufb zmm11,zmm11,zmm29 + vpshufb zmm12,zmm12,zmm29 +$L$_next_16_ok_12: + vshufi64x2 zmm2,zmm12,zmm12,255 + add r15b,16 + + vmovdqu8 zmm0,ZMMWORD[r11*1+r9] + vmovdqu8 zmm3,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm4,ZMMWORD[128+r11*1+r9] + vmovdqu8 zmm5,ZMMWORD[192+r11*1+r9] + + + vbroadcastf64x2 zmm6,ZMMWORD[rcx] + vpxorq zmm7,zmm7,zmm6 + vpxorq zmm10,zmm10,zmm6 + vpxorq zmm11,zmm11,zmm6 + vpxorq zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[16+rcx] + vaesenc zmm7,zmm7,zmm6 + vaesenc zmm10,zmm10,zmm6 + vaesenc zmm11,zmm11,zmm6 + vaesenc zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[32+rcx] + vaesenc zmm7,zmm7,zmm6 + vaesenc zmm10,zmm10,zmm6 + vaesenc zmm11,zmm11,zmm6 + vaesenc zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[48+rcx] + vaesenc zmm7,zmm7,zmm6 + vaesenc zmm10,zmm10,zmm6 + vaesenc zmm11,zmm11,zmm6 + vaesenc zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[64+rcx] + vaesenc zmm7,zmm7,zmm6 + vaesenc zmm10,zmm10,zmm6 + vaesenc zmm11,zmm11,zmm6 + vaesenc zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[80+rcx] + vaesenc zmm7,zmm7,zmm6 + vaesenc zmm10,zmm10,zmm6 + vaesenc zmm11,zmm11,zmm6 + vaesenc zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[96+rcx] + vaesenc zmm7,zmm7,zmm6 + vaesenc zmm10,zmm10,zmm6 + vaesenc zmm11,zmm11,zmm6 + vaesenc zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[112+rcx] + vaesenc 
zmm7,zmm7,zmm6 + vaesenc zmm10,zmm10,zmm6 + vaesenc zmm11,zmm11,zmm6 + vaesenc zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[128+rcx] + vaesenc zmm7,zmm7,zmm6 + vaesenc zmm10,zmm10,zmm6 + vaesenc zmm11,zmm11,zmm6 + vaesenc zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[144+rcx] + vaesenc zmm7,zmm7,zmm6 + vaesenc zmm10,zmm10,zmm6 + vaesenc zmm11,zmm11,zmm6 + vaesenc zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[160+rcx] + vaesenclast zmm7,zmm7,zmm6 + vaesenclast zmm10,zmm10,zmm6 + vaesenclast zmm11,zmm11,zmm6 + vaesenclast zmm12,zmm12,zmm6 + + + vpxorq zmm7,zmm7,zmm0 + vpxorq zmm10,zmm10,zmm3 + vpxorq zmm11,zmm11,zmm4 + vpxorq zmm12,zmm12,zmm5 + + + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm7 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm10 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm11 + vmovdqu8 ZMMWORD[192+r11*1+r10],zmm12 + + vpshufb zmm7,zmm7,zmm29 + vpshufb zmm10,zmm10,zmm29 + vpshufb zmm11,zmm11,zmm29 + vpshufb zmm12,zmm12,zmm29 + vmovdqa64 ZMMWORD[768+rsp],zmm7 + vmovdqa64 ZMMWORD[832+rsp],zmm10 + vmovdqa64 ZMMWORD[896+rsp],zmm11 + vmovdqa64 ZMMWORD[960+rsp],zmm12 + test r14,r14 + jnz NEAR $L$_skip_hkeys_precomputation_13 + + vmovdqu64 zmm0,ZMMWORD[288+rdx] + vmovdqu64 ZMMWORD[704+rsp],zmm0 + + vmovdqu64 zmm3,ZMMWORD[224+rdx] + vmovdqu64 ZMMWORD[640+rsp],zmm3 + + + vshufi64x2 zmm3,zmm3,zmm3,0x00 + + vmovdqu64 zmm4,ZMMWORD[160+rdx] + vmovdqu64 ZMMWORD[576+rsp],zmm4 + + vmovdqu64 zmm5,ZMMWORD[96+rdx] + vmovdqu64 ZMMWORD[512+rsp],zmm5 +$L$_skip_hkeys_precomputation_13: + cmp r13,512 + jb NEAR $L$_message_below_32_blocks_10 + + + + cmp r15b,240 + jae NEAR $L$_next_16_overflow_14 + vpaddd zmm7,zmm2,zmm28 + vpaddd zmm10,zmm7,zmm27 + vpaddd zmm11,zmm10,zmm27 + vpaddd zmm12,zmm11,zmm27 + jmp NEAR $L$_next_16_ok_14 +$L$_next_16_overflow_14: + vpshufb zmm2,zmm2,zmm29 + vmovdqa64 zmm12,ZMMWORD[ddq_add_4444] + vpaddd zmm7,zmm2,ZMMWORD[ddq_add_1234] + vpaddd zmm10,zmm7,zmm12 + vpaddd zmm11,zmm10,zmm12 + vpaddd zmm12,zmm11,zmm12 + vpshufb zmm7,zmm7,zmm29 + vpshufb zmm10,zmm10,zmm29 + vpshufb zmm11,zmm11,zmm29 + vpshufb zmm12,zmm12,zmm29 +$L$_next_16_ok_14: + vshufi64x2 zmm2,zmm12,zmm12,255 + add r15b,16 + + vmovdqu8 zmm0,ZMMWORD[256+r11*1+r9] + vmovdqu8 zmm3,ZMMWORD[320+r11*1+r9] + vmovdqu8 zmm4,ZMMWORD[384+r11*1+r9] + vmovdqu8 zmm5,ZMMWORD[448+r11*1+r9] + + + vbroadcastf64x2 zmm6,ZMMWORD[rcx] + vpxorq zmm7,zmm7,zmm6 + vpxorq zmm10,zmm10,zmm6 + vpxorq zmm11,zmm11,zmm6 + vpxorq zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[16+rcx] + vaesenc zmm7,zmm7,zmm6 + vaesenc zmm10,zmm10,zmm6 + vaesenc zmm11,zmm11,zmm6 + vaesenc zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[32+rcx] + vaesenc zmm7,zmm7,zmm6 + vaesenc zmm10,zmm10,zmm6 + vaesenc zmm11,zmm11,zmm6 + vaesenc zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[48+rcx] + vaesenc zmm7,zmm7,zmm6 + vaesenc zmm10,zmm10,zmm6 + vaesenc zmm11,zmm11,zmm6 + vaesenc zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[64+rcx] + vaesenc zmm7,zmm7,zmm6 + vaesenc zmm10,zmm10,zmm6 + vaesenc zmm11,zmm11,zmm6 + vaesenc zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[80+rcx] + vaesenc zmm7,zmm7,zmm6 + vaesenc zmm10,zmm10,zmm6 + vaesenc zmm11,zmm11,zmm6 + vaesenc zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[96+rcx] + vaesenc zmm7,zmm7,zmm6 + vaesenc zmm10,zmm10,zmm6 + vaesenc zmm11,zmm11,zmm6 + vaesenc zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[112+rcx] + vaesenc zmm7,zmm7,zmm6 + vaesenc zmm10,zmm10,zmm6 + vaesenc zmm11,zmm11,zmm6 + vaesenc zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[128+rcx] + vaesenc zmm7,zmm7,zmm6 + vaesenc zmm10,zmm10,zmm6 + vaesenc 
zmm11,zmm11,zmm6 + vaesenc zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[144+rcx] + vaesenc zmm7,zmm7,zmm6 + vaesenc zmm10,zmm10,zmm6 + vaesenc zmm11,zmm11,zmm6 + vaesenc zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[160+rcx] + vaesenclast zmm7,zmm7,zmm6 + vaesenclast zmm10,zmm10,zmm6 + vaesenclast zmm11,zmm11,zmm6 + vaesenclast zmm12,zmm12,zmm6 + + + vpxorq zmm7,zmm7,zmm0 + vpxorq zmm10,zmm10,zmm3 + vpxorq zmm11,zmm11,zmm4 + vpxorq zmm12,zmm12,zmm5 + + + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[256+r11*1+r10],zmm7 + vmovdqu8 ZMMWORD[320+r11*1+r10],zmm10 + vmovdqu8 ZMMWORD[384+r11*1+r10],zmm11 + vmovdqu8 ZMMWORD[448+r11*1+r10],zmm12 + + vpshufb zmm7,zmm7,zmm29 + vpshufb zmm10,zmm10,zmm29 + vpshufb zmm11,zmm11,zmm29 + vpshufb zmm12,zmm12,zmm29 + vmovdqa64 ZMMWORD[1024+rsp],zmm7 + vmovdqa64 ZMMWORD[1088+rsp],zmm10 + vmovdqa64 ZMMWORD[1152+rsp],zmm11 + vmovdqa64 ZMMWORD[1216+rsp],zmm12 + test r14,r14 + jnz NEAR $L$_skip_hkeys_precomputation_15 + vmovdqu64 zmm3,ZMMWORD[640+rsp] + + + vshufi64x2 zmm3,zmm3,zmm3,0x00 + + vmovdqu64 zmm4,ZMMWORD[576+rsp] + vmovdqu64 zmm5,ZMMWORD[512+rsp] + + vpclmulqdq zmm6,zmm4,zmm3,0x11 + vpclmulqdq zmm7,zmm4,zmm3,0x00 + vpclmulqdq zmm10,zmm4,zmm3,0x01 + vpclmulqdq zmm4,zmm4,zmm3,0x10 + vpxorq zmm4,zmm4,zmm10 + + vpsrldq zmm10,zmm4,8 + vpslldq zmm4,zmm4,8 + vpxorq zmm6,zmm6,zmm10 + vpxorq zmm4,zmm4,zmm7 + + + + vmovdqu64 zmm10,ZMMWORD[POLY2] + + vpclmulqdq zmm7,zmm10,zmm4,0x01 + vpslldq zmm7,zmm7,8 + vpxorq zmm4,zmm4,zmm7 + + + + vpclmulqdq zmm7,zmm10,zmm4,0x00 + vpsrldq zmm7,zmm7,4 + vpclmulqdq zmm4,zmm10,zmm4,0x10 + vpslldq zmm4,zmm4,4 + + vpternlogq zmm4,zmm6,zmm7,0x96 + + vmovdqu64 ZMMWORD[448+rsp],zmm4 + + vpclmulqdq zmm6,zmm5,zmm3,0x11 + vpclmulqdq zmm7,zmm5,zmm3,0x00 + vpclmulqdq zmm10,zmm5,zmm3,0x01 + vpclmulqdq zmm5,zmm5,zmm3,0x10 + vpxorq zmm5,zmm5,zmm10 + + vpsrldq zmm10,zmm5,8 + vpslldq zmm5,zmm5,8 + vpxorq zmm6,zmm6,zmm10 + vpxorq zmm5,zmm5,zmm7 + + + + vmovdqu64 zmm10,ZMMWORD[POLY2] + + vpclmulqdq zmm7,zmm10,zmm5,0x01 + vpslldq zmm7,zmm7,8 + vpxorq zmm5,zmm5,zmm7 + + + + vpclmulqdq zmm7,zmm10,zmm5,0x00 + vpsrldq zmm7,zmm7,4 + vpclmulqdq zmm5,zmm10,zmm5,0x10 + vpslldq zmm5,zmm5,4 + + vpternlogq zmm5,zmm6,zmm7,0x96 + + vmovdqu64 ZMMWORD[384+rsp],zmm5 + + vpclmulqdq zmm6,zmm4,zmm3,0x11 + vpclmulqdq zmm7,zmm4,zmm3,0x00 + vpclmulqdq zmm10,zmm4,zmm3,0x01 + vpclmulqdq zmm4,zmm4,zmm3,0x10 + vpxorq zmm4,zmm4,zmm10 + + vpsrldq zmm10,zmm4,8 + vpslldq zmm4,zmm4,8 + vpxorq zmm6,zmm6,zmm10 + vpxorq zmm4,zmm4,zmm7 + + + + vmovdqu64 zmm10,ZMMWORD[POLY2] + + vpclmulqdq zmm7,zmm10,zmm4,0x01 + vpslldq zmm7,zmm7,8 + vpxorq zmm4,zmm4,zmm7 + + + + vpclmulqdq zmm7,zmm10,zmm4,0x00 + vpsrldq zmm7,zmm7,4 + vpclmulqdq zmm4,zmm10,zmm4,0x10 + vpslldq zmm4,zmm4,4 + + vpternlogq zmm4,zmm6,zmm7,0x96 + + vmovdqu64 ZMMWORD[320+rsp],zmm4 + + vpclmulqdq zmm6,zmm5,zmm3,0x11 + vpclmulqdq zmm7,zmm5,zmm3,0x00 + vpclmulqdq zmm10,zmm5,zmm3,0x01 + vpclmulqdq zmm5,zmm5,zmm3,0x10 + vpxorq zmm5,zmm5,zmm10 + + vpsrldq zmm10,zmm5,8 + vpslldq zmm5,zmm5,8 + vpxorq zmm6,zmm6,zmm10 + vpxorq zmm5,zmm5,zmm7 + + + + vmovdqu64 zmm10,ZMMWORD[POLY2] + + vpclmulqdq zmm7,zmm10,zmm5,0x01 + vpslldq zmm7,zmm7,8 + vpxorq zmm5,zmm5,zmm7 + + + + vpclmulqdq zmm7,zmm10,zmm5,0x00 + vpsrldq zmm7,zmm7,4 + vpclmulqdq zmm5,zmm10,zmm5,0x10 + vpslldq zmm5,zmm5,4 + + vpternlogq zmm5,zmm6,zmm7,0x96 + + vmovdqu64 ZMMWORD[256+rsp],zmm5 + + vpclmulqdq zmm6,zmm4,zmm3,0x11 + vpclmulqdq zmm7,zmm4,zmm3,0x00 + vpclmulqdq zmm10,zmm4,zmm3,0x01 + vpclmulqdq zmm4,zmm4,zmm3,0x10 + vpxorq zmm4,zmm4,zmm10 + + vpsrldq zmm10,zmm4,8 + 
vpslldq zmm4,zmm4,8 + vpxorq zmm6,zmm6,zmm10 + vpxorq zmm4,zmm4,zmm7 + + + + vmovdqu64 zmm10,ZMMWORD[POLY2] + + vpclmulqdq zmm7,zmm10,zmm4,0x01 + vpslldq zmm7,zmm7,8 + vpxorq zmm4,zmm4,zmm7 + + + + vpclmulqdq zmm7,zmm10,zmm4,0x00 + vpsrldq zmm7,zmm7,4 + vpclmulqdq zmm4,zmm10,zmm4,0x10 + vpslldq zmm4,zmm4,4 + + vpternlogq zmm4,zmm6,zmm7,0x96 + + vmovdqu64 ZMMWORD[192+rsp],zmm4 + + vpclmulqdq zmm6,zmm5,zmm3,0x11 + vpclmulqdq zmm7,zmm5,zmm3,0x00 + vpclmulqdq zmm10,zmm5,zmm3,0x01 + vpclmulqdq zmm5,zmm5,zmm3,0x10 + vpxorq zmm5,zmm5,zmm10 + + vpsrldq zmm10,zmm5,8 + vpslldq zmm5,zmm5,8 + vpxorq zmm6,zmm6,zmm10 + vpxorq zmm5,zmm5,zmm7 + + + + vmovdqu64 zmm10,ZMMWORD[POLY2] + + vpclmulqdq zmm7,zmm10,zmm5,0x01 + vpslldq zmm7,zmm7,8 + vpxorq zmm5,zmm5,zmm7 + + + + vpclmulqdq zmm7,zmm10,zmm5,0x00 + vpsrldq zmm7,zmm7,4 + vpclmulqdq zmm5,zmm10,zmm5,0x10 + vpslldq zmm5,zmm5,4 + + vpternlogq zmm5,zmm6,zmm7,0x96 + + vmovdqu64 ZMMWORD[128+rsp],zmm5 + + vpclmulqdq zmm6,zmm4,zmm3,0x11 + vpclmulqdq zmm7,zmm4,zmm3,0x00 + vpclmulqdq zmm10,zmm4,zmm3,0x01 + vpclmulqdq zmm4,zmm4,zmm3,0x10 + vpxorq zmm4,zmm4,zmm10 + + vpsrldq zmm10,zmm4,8 + vpslldq zmm4,zmm4,8 + vpxorq zmm6,zmm6,zmm10 + vpxorq zmm4,zmm4,zmm7 + + + + vmovdqu64 zmm10,ZMMWORD[POLY2] + + vpclmulqdq zmm7,zmm10,zmm4,0x01 + vpslldq zmm7,zmm7,8 + vpxorq zmm4,zmm4,zmm7 + + + + vpclmulqdq zmm7,zmm10,zmm4,0x00 + vpsrldq zmm7,zmm7,4 + vpclmulqdq zmm4,zmm10,zmm4,0x10 + vpslldq zmm4,zmm4,4 + + vpternlogq zmm4,zmm6,zmm7,0x96 + + vmovdqu64 ZMMWORD[64+rsp],zmm4 + + vpclmulqdq zmm6,zmm5,zmm3,0x11 + vpclmulqdq zmm7,zmm5,zmm3,0x00 + vpclmulqdq zmm10,zmm5,zmm3,0x01 + vpclmulqdq zmm5,zmm5,zmm3,0x10 + vpxorq zmm5,zmm5,zmm10 + + vpsrldq zmm10,zmm5,8 + vpslldq zmm5,zmm5,8 + vpxorq zmm6,zmm6,zmm10 + vpxorq zmm5,zmm5,zmm7 + + + + vmovdqu64 zmm10,ZMMWORD[POLY2] + + vpclmulqdq zmm7,zmm10,zmm5,0x01 + vpslldq zmm7,zmm7,8 + vpxorq zmm5,zmm5,zmm7 + + + + vpclmulqdq zmm7,zmm10,zmm5,0x00 + vpsrldq zmm7,zmm7,4 + vpclmulqdq zmm5,zmm10,zmm5,0x10 + vpslldq zmm5,zmm5,4 + + vpternlogq zmm5,zmm6,zmm7,0x96 + + vmovdqu64 ZMMWORD[rsp],zmm5 +$L$_skip_hkeys_precomputation_15: + mov r14,1 + add r11,512 + sub r13,512 + + cmp r13,768 + jb NEAR $L$_no_more_big_nblocks_10 +$L$_encrypt_big_nblocks_10: + cmp r15b,240 + jae NEAR $L$_16_blocks_overflow_16 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + vpaddd zmm5,zmm4,zmm27 + jmp NEAR $L$_16_blocks_ok_16 +$L$_16_blocks_overflow_16: + vpshufb zmm2,zmm2,zmm29 + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpaddd zmm5,zmm4,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb zmm5,zmm5,zmm29 +$L$_16_blocks_ok_16: + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rsp] + + + + + vshufi64x2 zmm2,zmm5,zmm5,255 + add r15b,16 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + + + + + + + + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + + vpclmulqdq zmm6,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + + + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + + vpclmulqdq 
zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + + + + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + + vpternlogq zmm6,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + + + + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + + + + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20,ZMMWORD[128+r11*1+r9] + vmovdqu8 zmm21,ZMMWORD[192+r11*1+r9] + + + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + + + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm26,zmm10,zmm15 + vpxorq zmm24,zmm6,zmm12 + vpxorq zmm25,zmm7,zmm13 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vaesenclast zmm5,zmm5,zmm30 + + + + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vpxorq zmm5,zmm5,zmm21 + + + + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm4 + vmovdqu8 ZMMWORD[192+r11*1+r10],zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb zmm5,zmm5,zmm29 + vmovdqa64 ZMMWORD[1280+rsp],zmm0 + vmovdqa64 ZMMWORD[1344+rsp],zmm3 + vmovdqa64 ZMMWORD[1408+rsp],zmm4 + vmovdqa64 ZMMWORD[1472+rsp],zmm5 + cmp r15b,240 + jae NEAR $L$_16_blocks_overflow_17 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + vpaddd zmm5,zmm4,zmm27 + jmp NEAR $L$_16_blocks_ok_17 +$L$_16_blocks_overflow_17: + vpshufb zmm2,zmm2,zmm29 + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpaddd zmm5,zmm4,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb zmm5,zmm5,zmm29 +$L$_16_blocks_ok_17: + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1024+rsp] + vmovdqu64 zmm1,ZMMWORD[256+rsp] + + + + + vshufi64x2 zmm2,zmm5,zmm5,255 + add r15b,16 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[320+rsp] + vmovdqa64 zmm22,ZMMWORD[1088+rsp] + + + + + + + + vpxorq zmm0,zmm0,zmm30 + vpxorq 
zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + + vpclmulqdq zmm6,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[384+rsp] + vmovdqa64 zmm8,ZMMWORD[1152+rsp] + + + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[448+rsp] + vmovdqa64 zmm22,ZMMWORD[1216+rsp] + + + + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + + vpternlogq zmm6,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + + + + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + + + + vmovdqu8 zmm17,ZMMWORD[256+r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[320+r11*1+r9] + vmovdqu8 zmm20,ZMMWORD[384+r11*1+r9] + vmovdqu8 zmm21,ZMMWORD[448+r11*1+r9] + + + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + + + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm26,zmm10,zmm15,0x96 + vpternlogq zmm24,zmm6,zmm12,0x96 + vpternlogq zmm25,zmm7,zmm13,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vaesenclast zmm5,zmm5,zmm30 + + + + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vpxorq zmm5,zmm5,zmm21 + + + + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[256+r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[320+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[384+r11*1+r10],zmm4 + vmovdqu8 ZMMWORD[448+r11*1+r10],zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb zmm5,zmm5,zmm29 + vmovdqa64 ZMMWORD[768+rsp],zmm0 + vmovdqa64 ZMMWORD[832+rsp],zmm3 + vmovdqa64 ZMMWORD[896+rsp],zmm4 + vmovdqa64 ZMMWORD[960+rsp],zmm5 + cmp r15b,240 + jae NEAR $L$_16_blocks_overflow_18 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + vpaddd zmm5,zmm4,zmm27 + jmp NEAR $L$_16_blocks_ok_18 +$L$_16_blocks_overflow_18: + vpshufb zmm2,zmm2,zmm29 + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpaddd 
zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpaddd zmm5,zmm4,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb zmm5,zmm5,zmm29 +$L$_16_blocks_ok_18: + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1280+rsp] + vmovdqu64 zmm1,ZMMWORD[512+rsp] + + + + + vshufi64x2 zmm2,zmm5,zmm5,255 + add r15b,16 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[576+rsp] + vmovdqa64 zmm22,ZMMWORD[1344+rsp] + + + + + + + + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + + vpclmulqdq zmm6,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[640+rsp] + vmovdqa64 zmm8,ZMMWORD[1408+rsp] + + + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[704+rsp] + vmovdqa64 zmm22,ZMMWORD[1472+rsp] + + + + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + + vpternlogq zmm6,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + + + + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + + + + vmovdqu8 zmm17,ZMMWORD[512+r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[576+r11*1+r9] + vmovdqu8 zmm20,ZMMWORD[640+r11*1+r9] + vmovdqu8 zmm21,ZMMWORD[704+r11*1+r9] + + + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + + + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + + + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm10,zmm26,zmm15,0x96 + + vpsrldq zmm15,zmm10,8 + vpslldq zmm10,zmm10,8 + + vmovdqa64 xmm16,XMMWORD[POLY2] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vpternlogq zmm6,zmm12,zmm15,0x96 + vpxorq zmm6,zmm6,zmm24 + vpternlogq zmm7,zmm13,zmm10,0x96 + vpxorq zmm7,zmm7,zmm25 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vextracti64x4 ymm12,zmm6,1 + vpxorq ymm6,ymm6,ymm12 + vextracti32x4 xmm12,ymm6,1 + vpxorq xmm6,xmm6,xmm12 + vextracti64x4 ymm13,zmm7,1 + vpxorq ymm7,ymm7,ymm13 + vextracti32x4 xmm13,ymm7,1 + vpxorq xmm7,xmm7,xmm13 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vpclmulqdq xmm13,xmm16,xmm7,0x01 + vpslldq xmm13,xmm13,8 + vpxorq xmm13,xmm7,xmm13 + vpclmulqdq 
xmm12,xmm16,xmm13,0x00 + vpsrldq xmm12,xmm12,4 + vpclmulqdq xmm15,xmm16,xmm13,0x10 + vpslldq xmm15,xmm15,4 + + vpternlogq xmm6,xmm15,xmm12,0x96 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vaesenclast zmm5,zmm5,zmm30 + + + + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vpxorq zmm5,zmm5,zmm21 + + + + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[512+r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[576+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[640+r11*1+r10],zmm4 + vmovdqu8 ZMMWORD[704+r11*1+r10],zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb zmm5,zmm5,zmm29 + vmovdqa64 ZMMWORD[1024+rsp],zmm0 + vmovdqa64 ZMMWORD[1088+rsp],zmm3 + vmovdqa64 ZMMWORD[1152+rsp],zmm4 + vmovdqa64 ZMMWORD[1216+rsp],zmm5 + vmovdqa64 zmm14,zmm6 + + add r11,768 + sub r13,768 + cmp r13,768 + jae NEAR $L$_encrypt_big_nblocks_10 + +$L$_no_more_big_nblocks_10: + + cmp r13,512 + jae NEAR $L$_encrypt_32_blocks_10 + + cmp r13,256 + jae NEAR $L$_encrypt_16_blocks_10 +$L$_encrypt_0_blocks_ghash_32_10: + mov r10d,r13d + and r10d,~15 + mov ebx,256 + sub ebx,r10d + vmovdqa64 zmm13,ZMMWORD[768+rsp] + vpxorq zmm13,zmm13,zmm14 + vmovdqu64 zmm12,ZMMWORD[rbx*1+rsp] + vpclmulqdq zmm0,zmm13,zmm12,0x11 + vpclmulqdq zmm3,zmm13,zmm12,0x00 + vpclmulqdq zmm4,zmm13,zmm12,0x01 + vpclmulqdq zmm5,zmm13,zmm12,0x10 + vmovdqa64 zmm13,ZMMWORD[832+rsp] + vmovdqu64 zmm12,ZMMWORD[64+rbx*1+rsp] + vpclmulqdq zmm6,zmm13,zmm12,0x11 + vpclmulqdq zmm7,zmm13,zmm12,0x00 + vpclmulqdq zmm10,zmm13,zmm12,0x01 + vpclmulqdq zmm11,zmm13,zmm12,0x10 + vpxorq zmm26,zmm4,zmm10 + vpxorq zmm24,zmm0,zmm6 + vpxorq zmm25,zmm3,zmm7 + vpternlogq zmm26,zmm5,zmm11,0x96 + vmovdqa64 zmm13,ZMMWORD[896+rsp] + vmovdqu64 zmm12,ZMMWORD[128+rbx*1+rsp] + vpclmulqdq zmm0,zmm13,zmm12,0x11 + vpclmulqdq zmm3,zmm13,zmm12,0x00 + vpclmulqdq zmm4,zmm13,zmm12,0x01 + vpclmulqdq zmm5,zmm13,zmm12,0x10 + vmovdqa64 zmm13,ZMMWORD[960+rsp] + vmovdqu64 zmm12,ZMMWORD[192+rbx*1+rsp] + vpclmulqdq zmm6,zmm13,zmm12,0x11 + vpclmulqdq zmm7,zmm13,zmm12,0x00 + vpclmulqdq zmm10,zmm13,zmm12,0x01 + vpclmulqdq zmm11,zmm13,zmm12,0x10 + + vpternlogq zmm26,zmm4,zmm10,0x96 + vpternlogq zmm24,zmm0,zmm6,0x96 + vpternlogq zmm25,zmm3,zmm7,0x96 + vpternlogq zmm26,zmm5,zmm11,0x96 + add ebx,256 + mov r10d,r13d + add r10d,15 + shr r10d,4 + je NEAR $L$_last_num_blocks_is_0_19 + + cmp r10d,8 + je NEAR $L$_last_num_blocks_is_8_19 + jb NEAR $L$_last_num_blocks_is_7_1_19 + + + cmp r10d,12 + je NEAR $L$_last_num_blocks_is_12_19 + jb NEAR $L$_last_num_blocks_is_11_9_19 + + + cmp r10d,15 + je NEAR $L$_last_num_blocks_is_15_19 + ja NEAR $L$_last_num_blocks_is_16_19 + cmp r10d,14 + je NEAR $L$_last_num_blocks_is_14_19 + jmp NEAR $L$_last_num_blocks_is_13_19 + +$L$_last_num_blocks_is_11_9_19: + + cmp r10d,10 + je NEAR $L$_last_num_blocks_is_10_19 + ja NEAR $L$_last_num_blocks_is_11_19 + jmp NEAR $L$_last_num_blocks_is_9_19 + +$L$_last_num_blocks_is_7_1_19: + cmp r10d,4 + je NEAR $L$_last_num_blocks_is_4_19 + jb NEAR $L$_last_num_blocks_is_3_1_19 + + cmp r10d,6 + ja NEAR $L$_last_num_blocks_is_7_19 + je NEAR $L$_last_num_blocks_is_6_19 + jmp NEAR $L$_last_num_blocks_is_5_19 + +$L$_last_num_blocks_is_3_1_19: + + cmp r10d,2 + ja NEAR $L$_last_num_blocks_is_3_19 + je NEAR $L$_last_num_blocks_is_2_19 +$L$_last_num_blocks_is_1_19: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + kmovq k1,[rax*8+r10] + cmp r15d,255 + jae NEAR $L$_16_blocks_overflow_20 + vpaddd xmm0,xmm2,xmm28 + jmp NEAR $L$_16_blocks_ok_20 + +$L$_16_blocks_overflow_20: + vpshufb 
zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpshufb xmm0,xmm0,xmm29 +$L$_16_blocks_ok_20: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1024+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm0,0 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1088+rsp] + vpxorq xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[1152+rsp] + vaesenc xmm0,xmm0,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1216+rsp] + vaesenc xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc xmm0,xmm0,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 xmm17{k1}{z},[r11*1+r9] + vaesenc xmm0,xmm0,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm24,zmm14,zmm12,0x96 + vpternlogq zmm25,zmm7,zmm13,0x96 + vpternlogq zmm26,zmm10,zmm15,0x96 + vaesenc xmm0,xmm0,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc xmm0,xmm0,xmm31 + vaesenclast xmm0,xmm0,xmm30 + vpxorq xmm0,xmm0,xmm17 + vextracti32x4 xmm11,zmm0,0 + mov r10,QWORD[120+rbp] + vmovdqu8 XMMWORD[r11*1+r10]{k1},xmm0 + vmovdqu8 zmm0{k1}{z},zmm0 + vpshufb xmm17,xmm0,xmm29 + vextracti32x4 xmm7,zmm17,0 + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_21 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm17,xmm1,0x01 + vpclmulqdq xmm5,xmm17,xmm1,0x10 + vpclmulqdq xmm0,xmm17,xmm1,0x11 + vpclmulqdq xmm3,xmm17,xmm1,0x00 + vpxorq zmm4,zmm4,zmm26 + vpxorq zmm0,zmm0,zmm24 + vpxorq zmm3,zmm3,zmm25 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_21 +$L$_small_initial_partial_block_21: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + + + vpsrldq zmm0,zmm26,8 + vpslldq zmm3,zmm26,8 + vpxorq zmm24,zmm24,zmm0 + vpxorq zmm25,zmm25,zmm3 + vextracti64x4 ymm0,zmm24,1 + vpxorq 
ymm24,ymm24,ymm0 + vextracti32x4 xmm0,ymm24,1 + vpxorq xmm24,xmm24,xmm0 + vextracti64x4 ymm3,zmm25,1 + vpxorq ymm25,ymm25,ymm3 + vextracti32x4 xmm3,ymm25,1 + vpxorq xmm25,xmm25,xmm3 + vmovdqa64 xmm0,XMMWORD[POLY2] + + + vpclmulqdq xmm3,xmm0,xmm25,0x01 + vpslldq xmm3,xmm3,8 + vpxorq xmm3,xmm25,xmm3 + + + vpclmulqdq xmm4,xmm0,xmm3,0x00 + vpsrldq xmm4,xmm4,4 + vpclmulqdq xmm14,xmm0,xmm3,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm4,xmm24,0x96 + + + + + + + + + + + + + vpxorq xmm14,xmm14,xmm7 + + jmp NEAR $L$_after_reduction_21 +$L$_small_initial_compute_done_21: +$L$_after_reduction_21: + jmp NEAR $L$_last_blocks_done_19 +$L$_last_num_blocks_is_2_19: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + kmovq k1,[rax*8+r10] + cmp r15d,254 + jae NEAR $L$_16_blocks_overflow_22 + vpaddd ymm0,ymm2,ymm28 + jmp NEAR $L$_16_blocks_ok_22 + +$L$_16_blocks_overflow_22: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpshufb ymm0,ymm0,ymm29 +$L$_16_blocks_ok_22: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1024+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm0,1 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1088+rsp] + vpxorq ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[1152+rsp] + vaesenc ymm0,ymm0,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1216+rsp] + vaesenc ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc ymm0,ymm0,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 ymm17{k1}{z},[r11*1+r9] + vaesenc ymm0,ymm0,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm24,zmm14,zmm12,0x96 + vpternlogq zmm25,zmm7,zmm13,0x96 + vpternlogq zmm26,zmm10,zmm15,0x96 + vaesenc ymm0,ymm0,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc ymm0,ymm0,ymm31 + vaesenclast ymm0,ymm0,ymm30 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm11,zmm0,1 + mov r10,QWORD[120+rbp] + vmovdqu8 YMMWORD[r11*1+r10]{k1},ymm0 + vmovdqu8 zmm0{k1}{z},zmm0 + vpshufb ymm17,ymm0,ymm29 + vextracti32x4 xmm7,zmm17,1 + sub r13,16 * (2 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_23 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm17,ymm1,0x01 + vpclmulqdq ymm5,ymm17,ymm1,0x10 + vpclmulqdq ymm0,ymm17,ymm1,0x11 + vpclmulqdq ymm3,ymm17,ymm1,0x00 + vpxorq zmm4,zmm4,zmm26 + vpxorq zmm0,zmm0,zmm24 + vpxorq 
zmm3,zmm3,zmm25 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_23 +$L$_small_initial_partial_block_23: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm17,xmm1,0x01 + vpclmulqdq xmm5,xmm17,xmm1,0x10 + vpclmulqdq xmm0,xmm17,xmm1,0x11 + vpclmulqdq xmm3,xmm17,xmm1,0x00 + vpxorq zmm4,zmm4,zmm26 + vpxorq zmm0,zmm0,zmm24 + vpxorq zmm3,zmm3,zmm25 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_23: + + or r13,r13 + je NEAR $L$_after_reduction_23 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_23: + jmp NEAR $L$_last_blocks_done_19 +$L$_last_num_blocks_is_3_19: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + kmovq k1,[rax*8+r10] + cmp r15d,253 + jae NEAR $L$_16_blocks_overflow_24 + vpaddd zmm0,zmm2,zmm28 + jmp NEAR $L$_16_blocks_ok_24 + +$L$_16_blocks_overflow_24: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpshufb zmm0,zmm0,zmm29 +$L$_16_blocks_ok_24: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1024+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm0,2 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1088+rsp] + vpxorq zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[1152+rsp] + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1216+rsp] + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17{k1}{z},[r11*1+r9] + vaesenc 
zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm24,zmm14,zmm12,0x96 + vpternlogq zmm25,zmm7,zmm13,0x96 + vpternlogq zmm26,zmm10,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vpxorq zmm0,zmm0,zmm17 + vextracti32x4 xmm11,zmm0,2 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10]{k1},zmm0 + vmovdqu8 zmm0{k1}{z},zmm0 + vpshufb zmm17,zmm0,zmm29 + vextracti32x4 xmm7,zmm17,2 + sub r13,16 * (3 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_25 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpxorq zmm4,zmm4,zmm26 + vpxorq zmm0,zmm0,zmm24 + vpxorq zmm3,zmm3,zmm25 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_25 +$L$_small_initial_partial_block_25: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm17,ymm1,0x01 + vpclmulqdq ymm5,ymm17,ymm1,0x10 + vpclmulqdq ymm0,ymm17,ymm1,0x11 + vpclmulqdq ymm3,ymm17,ymm1,0x00 + vpxorq zmm4,zmm4,zmm26 + vpxorq zmm0,zmm0,zmm24 + vpxorq zmm3,zmm3,zmm25 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_25: + + or r13,r13 + je NEAR $L$_after_reduction_25 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_25: + jmp NEAR $L$_last_blocks_done_19 +$L$_last_num_blocks_is_4_19: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + kmovq k1,[rax*8+r10] + cmp r15d,252 + jae NEAR $L$_16_blocks_overflow_26 + vpaddd zmm0,zmm2,zmm28 + jmp NEAR $L$_16_blocks_ok_26 + +$L$_16_blocks_overflow_26: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpshufb zmm0,zmm0,zmm29 +$L$_16_blocks_ok_26: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1024+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm0,3 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 
zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1088+rsp] + vpxorq zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[1152+rsp] + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1216+rsp] + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17{k1}{z},[r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm24,zmm14,zmm12,0x96 + vpternlogq zmm25,zmm7,zmm13,0x96 + vpternlogq zmm26,zmm10,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vpxorq zmm0,zmm0,zmm17 + vextracti32x4 xmm11,zmm0,3 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10]{k1},zmm0 + vmovdqu8 zmm0{k1}{z},zmm0 + vpshufb zmm17,zmm0,zmm29 + vextracti32x4 xmm7,zmm17,3 + sub r13,16 * (4 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_27 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + + vpxorq zmm30,zmm30,zmm26 + vpxorq zmm8,zmm8,zmm24 + vpxorq zmm22,zmm22,zmm25 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_27 +$L$_small_initial_partial_block_27: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpxorq zmm4,zmm4,zmm26 + vpxorq zmm0,zmm0,zmm24 + vpxorq zmm3,zmm3,zmm25 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq 
zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_27: + + or r13,r13 + je NEAR $L$_after_reduction_27 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_27: + jmp NEAR $L$_last_blocks_done_19 +$L$_last_num_blocks_is_5_19: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,64 + kmovq k1,[rax*8+r10] + cmp r15d,251 + jae NEAR $L$_16_blocks_overflow_28 + vpaddd zmm0,zmm2,zmm28 + vpaddd xmm3,xmm0,xmm27 + jmp NEAR $L$_16_blocks_ok_28 + +$L$_16_blocks_overflow_28: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb xmm3,xmm3,xmm29 +$L$_16_blocks_ok_28: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1024+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm3,0 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1088+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[1152+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1216+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 xmm19{k1}{z},[64+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm24,zmm14,zmm12,0x96 + vpternlogq zmm25,zmm7,zmm13,0x96 + vpternlogq zmm26,zmm10,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast xmm3,xmm3,xmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq xmm3,xmm3,xmm19 + vextracti32x4 
xmm11,zmm3,0 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 XMMWORD[64+r11*1+r10]{k1},xmm3 + vmovdqu8 zmm3{k1}{z},zmm3 + vpshufb zmm17,zmm0,zmm29 + vpshufb xmm19,xmm3,xmm29 + vextracti32x4 xmm7,zmm19,0 + sub r13,16 * (5 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_29 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm19,xmm1,0x01 + vpclmulqdq xmm5,xmm19,xmm1,0x10 + vpclmulqdq xmm0,xmm19,xmm1,0x11 + vpclmulqdq xmm3,xmm19,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_29 +$L$_small_initial_partial_block_29: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + + vpxorq zmm30,zmm30,zmm26 + vpxorq zmm8,zmm8,zmm24 + vpxorq zmm22,zmm22,zmm25 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_29: + + or r13,r13 + je NEAR $L$_after_reduction_29 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_29: + jmp NEAR $L$_last_blocks_done_19 +$L$_last_num_blocks_is_6_19: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,64 + kmovq k1,[rax*8+r10] + cmp r15d,250 + jae NEAR $L$_16_blocks_overflow_30 + vpaddd zmm0,zmm2,zmm28 + vpaddd ymm3,ymm0,ymm27 + jmp NEAR $L$_16_blocks_ok_30 + +$L$_16_blocks_overflow_30: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb ymm3,ymm3,ymm29 +$L$_16_blocks_ok_30: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1024+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm3,1 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1088+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 
+ vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[1152+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1216+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 ymm19{k1}{z},[64+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm24,zmm14,zmm12,0x96 + vpternlogq zmm25,zmm7,zmm13,0x96 + vpternlogq zmm26,zmm10,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast ymm3,ymm3,ymm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm11,zmm3,1 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 YMMWORD[64+r11*1+r10]{k1},ymm3 + vmovdqu8 zmm3{k1}{z},zmm3 + vpshufb zmm17,zmm0,zmm29 + vpshufb ymm19,ymm3,ymm29 + vextracti32x4 xmm7,zmm19,1 + sub r13,16 * (6 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_31 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm19,ymm1,0x01 + vpclmulqdq ymm5,ymm19,ymm1,0x10 + vpclmulqdq ymm0,ymm19,ymm1,0x11 + vpclmulqdq ymm3,ymm19,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_31 +$L$_small_initial_partial_block_31: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[272+rdx] 
+ vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm19,xmm1,0x01 + vpclmulqdq xmm5,xmm19,xmm1,0x10 + vpclmulqdq xmm0,xmm19,xmm1,0x11 + vpclmulqdq xmm3,xmm19,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_31: + + or r13,r13 + je NEAR $L$_after_reduction_31 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_31: + jmp NEAR $L$_last_blocks_done_19 +$L$_last_num_blocks_is_7_19: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,64 + kmovq k1,[rax*8+r10] + cmp r15d,249 + jae NEAR $L$_16_blocks_overflow_32 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + jmp NEAR $L$_16_blocks_ok_32 + +$L$_16_blocks_overflow_32: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 +$L$_16_blocks_ok_32: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1024+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm3,2 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1088+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[1152+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1216+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19{k1}{z},[64+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + 
vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm24,zmm14,zmm12,0x96 + vpternlogq zmm25,zmm7,zmm13,0x96 + vpternlogq zmm26,zmm10,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti32x4 xmm11,zmm3,2 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10]{k1},zmm3 + vmovdqu8 zmm3{k1}{z},zmm3 + vpshufb zmm17,zmm0,zmm29 + vpshufb zmm19,zmm3,zmm29 + vextracti32x4 xmm7,zmm19,2 + sub r13,16 * (7 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_33 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm19,zmm1,0x01 + vpclmulqdq zmm5,zmm19,zmm1,0x10 + vpclmulqdq zmm0,zmm19,zmm1,0x11 + vpclmulqdq zmm3,zmm19,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_33 +$L$_small_initial_partial_block_33: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm19,ymm1,0x01 + vpclmulqdq ymm5,ymm19,ymm1,0x10 + vpclmulqdq ymm0,ymm19,ymm1,0x11 + vpclmulqdq ymm3,ymm19,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_33: + + or r13,r13 + je NEAR $L$_after_reduction_33 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_33: + jmp NEAR $L$_last_blocks_done_19 +$L$_last_num_blocks_is_8_19: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,64 + kmovq k1,[rax*8+r10] + cmp 
r15d,248 + jae NEAR $L$_16_blocks_overflow_34 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + jmp NEAR $L$_16_blocks_ok_34 + +$L$_16_blocks_overflow_34: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 +$L$_16_blocks_ok_34: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1024+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm3,3 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1088+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[1152+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1216+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19{k1}{z},[64+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm24,zmm14,zmm12,0x96 + vpternlogq zmm25,zmm7,zmm13,0x96 + vpternlogq zmm26,zmm10,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti32x4 xmm11,zmm3,3 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10]{k1},zmm3 + vmovdqu8 zmm3{k1}{z},zmm3 + vpshufb zmm17,zmm0,zmm29 + vpshufb zmm19,zmm3,zmm29 + vextracti32x4 xmm7,zmm19,3 + sub r13,16 * (8 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_35 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[224+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + + 
vpxorq zmm30,zmm30,zmm26 + vpxorq zmm8,zmm8,zmm24 + vpxorq zmm22,zmm22,zmm25 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_35 +$L$_small_initial_partial_block_35: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm19,zmm1,0x01 + vpclmulqdq zmm5,zmm19,zmm1,0x10 + vpclmulqdq zmm0,zmm19,zmm1,0x11 + vpclmulqdq zmm3,zmm19,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_35: + + or r13,r13 + je NEAR $L$_after_reduction_35 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_35: + jmp NEAR $L$_last_blocks_done_19 +$L$_last_num_blocks_is_9_19: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,128 + kmovq k1,[rax*8+r10] + cmp r15d,247 + jae NEAR $L$_16_blocks_overflow_36 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd xmm4,xmm3,xmm27 + jmp NEAR $L$_16_blocks_ok_36 + +$L$_16_blocks_overflow_36: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb xmm4,xmm4,xmm29 +$L$_16_blocks_ok_36: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1024+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm4,0 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1088+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[1152+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + 
vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1216+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 xmm20{k1}{z},[128+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm24,zmm14,zmm12,0x96 + vpternlogq zmm25,zmm7,zmm13,0x96 + vpternlogq zmm26,zmm10,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast xmm4,xmm4,xmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq xmm4,xmm4,xmm20 + vextracti32x4 xmm11,zmm4,0 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 XMMWORD[128+r11*1+r10]{k1},xmm4 + vmovdqu8 zmm4{k1}{z},zmm4 + vpshufb zmm17,zmm0,zmm29 + vpshufb zmm19,zmm3,zmm29 + vpshufb xmm20,xmm4,xmm29 + vextracti32x4 xmm7,zmm20,0 + sub r13,16 * (9 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_37 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[208+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm20,xmm1,0x01 + vpclmulqdq xmm5,xmm20,xmm1,0x10 + vpclmulqdq xmm0,xmm20,xmm1,0x11 + vpclmulqdq xmm3,xmm20,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq 
xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_37 +$L$_small_initial_partial_block_37: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[224+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + + vpxorq zmm30,zmm30,zmm26 + vpxorq zmm8,zmm8,zmm24 + vpxorq zmm22,zmm22,zmm25 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_37: + + or r13,r13 + je NEAR $L$_after_reduction_37 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_37: + jmp NEAR $L$_last_blocks_done_19 +$L$_last_num_blocks_is_10_19: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,128 + kmovq k1,[rax*8+r10] + cmp r15d,246 + jae NEAR $L$_16_blocks_overflow_38 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd ymm4,ymm3,ymm27 + jmp NEAR $L$_16_blocks_ok_38 + +$L$_16_blocks_overflow_38: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb ymm4,ymm4,ymm29 +$L$_16_blocks_ok_38: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1024+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm4,1 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1088+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[1152+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1216+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq 
zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 ymm20{k1}{z},[128+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm24,zmm14,zmm12,0x96 + vpternlogq zmm25,zmm7,zmm13,0x96 + vpternlogq zmm26,zmm10,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast ymm4,ymm4,ymm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq ymm4,ymm4,ymm20 + vextracti32x4 xmm11,zmm4,1 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 YMMWORD[128+r11*1+r10]{k1},ymm4 + vmovdqu8 zmm4{k1}{z},zmm4 + vpshufb zmm17,zmm0,zmm29 + vpshufb zmm19,zmm3,zmm29 + vpshufb ymm20,ymm4,ymm29 + vextracti32x4 xmm7,zmm20,1 + sub r13,16 * (10 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_39 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[192+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm20,ymm1,0x01 + vpclmulqdq ymm5,ymm20,ymm1,0x10 + vpclmulqdq ymm0,ymm20,ymm1,0x11 + vpclmulqdq ymm3,ymm20,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_39 +$L$_small_initial_partial_block_39: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[208+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + 
vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm20,xmm1,0x01 + vpclmulqdq xmm5,xmm20,xmm1,0x10 + vpclmulqdq xmm0,xmm20,xmm1,0x11 + vpclmulqdq xmm3,xmm20,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_39: + + or r13,r13 + je NEAR $L$_after_reduction_39 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_39: + jmp NEAR $L$_last_blocks_done_19 +$L$_last_num_blocks_is_11_19: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,128 + kmovq k1,[rax*8+r10] + cmp r15d,245 + jae NEAR $L$_16_blocks_overflow_40 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + jmp NEAR $L$_16_blocks_ok_40 + +$L$_16_blocks_overflow_40: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 +$L$_16_blocks_ok_40: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1024+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm4,2 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1088+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[1152+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1216+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20{k1}{z},[128+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc 
zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm24,zmm14,zmm12,0x96 + vpternlogq zmm25,zmm7,zmm13,0x96 + vpternlogq zmm26,zmm10,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vextracti32x4 xmm11,zmm4,2 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10]{k1},zmm4 + vmovdqu8 zmm4{k1}{z},zmm4 + vpshufb zmm17,zmm0,zmm29 + vpshufb zmm19,zmm3,zmm29 + vpshufb zmm20,zmm4,zmm29 + vextracti32x4 xmm7,zmm20,2 + sub r13,16 * (11 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_41 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[176+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm20,zmm1,0x01 + vpclmulqdq zmm5,zmm20,zmm1,0x10 + vpclmulqdq zmm0,zmm20,zmm1,0x11 + vpclmulqdq zmm3,zmm20,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_41 +$L$_small_initial_partial_block_41: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[192+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm20,ymm1,0x01 + vpclmulqdq ymm5,ymm20,ymm1,0x10 + vpclmulqdq ymm0,ymm20,ymm1,0x11 + vpclmulqdq ymm3,ymm20,ymm1,0x00 + + 
vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_41: + + or r13,r13 + je NEAR $L$_after_reduction_41 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_41: + jmp NEAR $L$_last_blocks_done_19 +$L$_last_num_blocks_is_12_19: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,128 + kmovq k1,[rax*8+r10] + cmp r15d,244 + jae NEAR $L$_16_blocks_overflow_42 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + jmp NEAR $L$_16_blocks_ok_42 + +$L$_16_blocks_overflow_42: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 +$L$_16_blocks_ok_42: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1024+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm4,3 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1088+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[1152+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1216+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20{k1}{z},[128+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 
zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm24,zmm14,zmm12,0x96 + vpternlogq zmm25,zmm7,zmm13,0x96 + vpternlogq zmm26,zmm10,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vextracti32x4 xmm11,zmm4,3 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10]{k1},zmm4 + vmovdqu8 zmm4{k1}{z},zmm4 + vpshufb zmm17,zmm0,zmm29 + vpshufb zmm19,zmm3,zmm29 + vpshufb zmm20,zmm4,zmm29 + vextracti32x4 xmm7,zmm20,3 + sub r13,16 * (12 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_43 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[160+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[224+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + + vpxorq zmm30,zmm30,zmm26 + vpxorq zmm8,zmm8,zmm24 + vpxorq zmm22,zmm22,zmm25 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_43 +$L$_small_initial_partial_block_43: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[176+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm20,zmm1,0x01 + vpclmulqdq zmm5,zmm20,zmm1,0x10 + vpclmulqdq zmm0,zmm20,zmm1,0x11 + vpclmulqdq zmm3,zmm20,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + 
vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_43: + + or r13,r13 + je NEAR $L$_after_reduction_43 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_43: + jmp NEAR $L$_last_blocks_done_19 +$L$_last_num_blocks_is_13_19: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,192 + kmovq k1,[rax*8+r10] + cmp r15d,243 + jae NEAR $L$_16_blocks_overflow_44 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + vpaddd xmm5,xmm4,xmm27 + jmp NEAR $L$_16_blocks_ok_44 + +$L$_16_blocks_overflow_44: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpaddd zmm5,zmm4,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb xmm5,xmm5,xmm29 +$L$_16_blocks_ok_44: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1024+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm5,0 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1088+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vpxorq xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[1152+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1216+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20,ZMMWORD[128+r11*1+r9] + vmovdqu8 xmm21{k1}{z},[192+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc xmm5,xmm5,xmm30 + vbroadcastf64x2 
zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm24,zmm14,zmm12,0x96 + vpternlogq zmm25,zmm7,zmm13,0x96 + vpternlogq zmm26,zmm10,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vaesenclast xmm5,xmm5,xmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vpxorq xmm5,xmm5,xmm21 + vextracti32x4 xmm11,zmm5,0 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm4 + vmovdqu8 XMMWORD[192+r11*1+r10]{k1},xmm5 + vmovdqu8 zmm5{k1}{z},zmm5 + vpshufb zmm17,zmm0,zmm29 + vpshufb zmm19,zmm3,zmm29 + vpshufb zmm20,zmm4,zmm29 + vpshufb xmm21,xmm5,xmm29 + vextracti32x4 xmm7,zmm21,0 + sub r13,16 * (13 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_45 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[144+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[208+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm21,xmm1,0x01 + vpclmulqdq xmm5,xmm21,xmm1,0x10 + vpclmulqdq xmm0,xmm21,xmm1,0x11 + vpclmulqdq xmm3,xmm21,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_45 +$L$_small_initial_partial_block_45: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[160+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[224+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + 
vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + + vpxorq zmm30,zmm30,zmm26 + vpxorq zmm8,zmm8,zmm24 + vpxorq zmm22,zmm22,zmm25 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_45: + + or r13,r13 + je NEAR $L$_after_reduction_45 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_45: + jmp NEAR $L$_last_blocks_done_19 +$L$_last_num_blocks_is_14_19: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,192 + kmovq k1,[rax*8+r10] + cmp r15d,242 + jae NEAR $L$_16_blocks_overflow_46 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + vpaddd ymm5,ymm4,ymm27 + jmp NEAR $L$_16_blocks_ok_46 + +$L$_16_blocks_overflow_46: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpaddd zmm5,zmm4,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb ymm5,ymm5,ymm29 +$L$_16_blocks_ok_46: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1024+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm5,1 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1088+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vpxorq ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[1152+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1216+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20,ZMMWORD[128+r11*1+r9] + vmovdqu8 ymm21{k1}{z},[192+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc 
zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm24,zmm14,zmm12,0x96 + vpternlogq zmm25,zmm7,zmm13,0x96 + vpternlogq zmm26,zmm10,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vaesenclast ymm5,ymm5,ymm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vpxorq ymm5,ymm5,ymm21 + vextracti32x4 xmm11,zmm5,1 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm4 + vmovdqu8 YMMWORD[192+r11*1+r10]{k1},ymm5 + vmovdqu8 zmm5{k1}{z},zmm5 + vpshufb zmm17,zmm0,zmm29 + vpshufb zmm19,zmm3,zmm29 + vpshufb zmm20,zmm4,zmm29 + vpshufb ymm21,ymm5,ymm29 + vextracti32x4 xmm7,zmm21,1 + sub r13,16 * (14 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_47 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[128+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[192+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm21,ymm1,0x01 + vpclmulqdq ymm5,ymm21,ymm1,0x10 + vpclmulqdq ymm0,ymm21,ymm1,0x11 + vpclmulqdq ymm3,ymm21,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_47 +$L$_small_initial_partial_block_47: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[144+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[208+rdx] + 
vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm21,xmm1,0x01 + vpclmulqdq xmm5,xmm21,xmm1,0x10 + vpclmulqdq xmm0,xmm21,xmm1,0x11 + vpclmulqdq xmm3,xmm21,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_47: + + or r13,r13 + je NEAR $L$_after_reduction_47 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_47: + jmp NEAR $L$_last_blocks_done_19 +$L$_last_num_blocks_is_15_19: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,192 + kmovq k1,[rax*8+r10] + cmp r15d,241 + jae NEAR $L$_16_blocks_overflow_48 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + vpaddd zmm5,zmm4,zmm27 + jmp NEAR $L$_16_blocks_ok_48 + +$L$_16_blocks_overflow_48: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpaddd zmm5,zmm4,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb zmm5,zmm5,zmm29 +$L$_16_blocks_ok_48: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1024+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm5,2 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1088+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[1152+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1216+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc 
zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20,ZMMWORD[128+r11*1+r9] + vmovdqu8 zmm21{k1}{z},[192+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm24,zmm14,zmm12,0x96 + vpternlogq zmm25,zmm7,zmm13,0x96 + vpternlogq zmm26,zmm10,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vaesenclast zmm5,zmm5,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vpxorq zmm5,zmm5,zmm21 + vextracti32x4 xmm11,zmm5,2 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm4 + vmovdqu8 ZMMWORD[192+r11*1+r10]{k1},zmm5 + vmovdqu8 zmm5{k1}{z},zmm5 + vpshufb zmm17,zmm0,zmm29 + vpshufb zmm19,zmm3,zmm29 + vpshufb zmm20,zmm4,zmm29 + vpshufb zmm21,zmm5,zmm29 + vextracti32x4 xmm7,zmm21,2 + sub r13,16 * (15 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_49 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[112+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[176+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm21,zmm1,0x01 + vpclmulqdq zmm5,zmm21,zmm1,0x10 + vpclmulqdq zmm0,zmm21,zmm1,0x11 + vpclmulqdq zmm3,zmm21,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 
xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_49 +$L$_small_initial_partial_block_49: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[128+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[192+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm21,ymm1,0x01 + vpclmulqdq ymm5,ymm21,ymm1,0x10 + vpclmulqdq ymm0,ymm21,ymm1,0x11 + vpclmulqdq ymm3,ymm21,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_49: + + or r13,r13 + je NEAR $L$_after_reduction_49 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_49: + jmp NEAR $L$_last_blocks_done_19 +$L$_last_num_blocks_is_16_19: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,192 + kmovq k1,[rax*8+r10] + cmp r15d,240 + jae NEAR $L$_16_blocks_overflow_50 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + vpaddd zmm5,zmm4,zmm27 + jmp NEAR $L$_16_blocks_ok_50 + +$L$_16_blocks_overflow_50: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpaddd zmm5,zmm4,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb zmm5,zmm5,zmm29 +$L$_16_blocks_ok_50: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1024+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm5,3 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1088+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[1152+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 
zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1216+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20,ZMMWORD[128+r11*1+r9] + vmovdqu8 zmm21{k1}{z},[192+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm24,zmm14,zmm12,0x96 + vpternlogq zmm25,zmm7,zmm13,0x96 + vpternlogq zmm26,zmm10,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vaesenclast zmm5,zmm5,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vpxorq zmm5,zmm5,zmm21 + vextracti32x4 xmm11,zmm5,3 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm4 + vmovdqu8 ZMMWORD[192+r11*1+r10]{k1},zmm5 + vmovdqu8 zmm5{k1}{z},zmm5 + vpshufb zmm17,zmm0,zmm29 + vpshufb zmm19,zmm3,zmm29 + vpshufb zmm20,zmm4,zmm29 + vpshufb zmm21,zmm5,zmm29 + vextracti32x4 xmm7,zmm21,3 + sub r13,16 * (16 - 1) +$L$_small_initial_partial_block_51: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[112+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[176+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + 
vpclmulqdq zmm4,zmm21,zmm1,0x01 + vpclmulqdq zmm5,zmm21,zmm1,0x10 + vpclmulqdq zmm0,zmm21,zmm1,0x11 + vpclmulqdq zmm3,zmm21,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_51: + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_51: + jmp NEAR $L$_last_blocks_done_19 +$L$_last_num_blocks_is_0_19: + vmovdqa64 zmm13,ZMMWORD[1024+rsp] + vmovdqu64 zmm12,ZMMWORD[rbx*1+rsp] + vpclmulqdq zmm0,zmm13,zmm12,0x11 + vpclmulqdq zmm3,zmm13,zmm12,0x00 + vpclmulqdq zmm4,zmm13,zmm12,0x01 + vpclmulqdq zmm5,zmm13,zmm12,0x10 + vmovdqa64 zmm13,ZMMWORD[1088+rsp] + vmovdqu64 zmm12,ZMMWORD[64+rbx*1+rsp] + vpclmulqdq zmm6,zmm13,zmm12,0x11 + vpclmulqdq zmm7,zmm13,zmm12,0x00 + vpclmulqdq zmm10,zmm13,zmm12,0x01 + vpclmulqdq zmm11,zmm13,zmm12,0x10 + vpternlogq zmm26,zmm4,zmm10,0x96 + vpternlogq zmm24,zmm0,zmm6,0x96 + vpternlogq zmm25,zmm3,zmm7,0x96 + vpternlogq zmm26,zmm5,zmm11,0x96 + vmovdqa64 zmm13,ZMMWORD[1152+rsp] + vmovdqu64 zmm12,ZMMWORD[128+rbx*1+rsp] + vpclmulqdq zmm0,zmm13,zmm12,0x11 + vpclmulqdq zmm3,zmm13,zmm12,0x00 + vpclmulqdq zmm4,zmm13,zmm12,0x01 + vpclmulqdq zmm5,zmm13,zmm12,0x10 + vmovdqa64 zmm13,ZMMWORD[1216+rsp] + vmovdqu64 zmm12,ZMMWORD[192+rbx*1+rsp] + vpclmulqdq zmm6,zmm13,zmm12,0x11 + vpclmulqdq zmm7,zmm13,zmm12,0x00 + vpclmulqdq zmm10,zmm13,zmm12,0x01 + vpclmulqdq zmm11,zmm13,zmm12,0x10 + + vpternlogq zmm26,zmm4,zmm10,0x96 + vpternlogq zmm24,zmm0,zmm6,0x96 + vpternlogq zmm25,zmm3,zmm7,0x96 + vpternlogq zmm26,zmm5,zmm11,0x96 + + vpsrldq zmm0,zmm26,8 + vpslldq zmm3,zmm26,8 + vpxorq zmm24,zmm24,zmm0 + vpxorq zmm25,zmm25,zmm3 + vextracti64x4 ymm0,zmm24,1 + vpxorq ymm24,ymm24,ymm0 + vextracti32x4 xmm0,ymm24,1 + vpxorq xmm24,xmm24,xmm0 + vextracti64x4 ymm3,zmm25,1 + vpxorq ymm25,ymm25,ymm3 + vextracti32x4 xmm3,ymm25,1 + vpxorq xmm25,xmm25,xmm3 + vmovdqa64 xmm4,XMMWORD[POLY2] + + + vpclmulqdq xmm0,xmm4,xmm25,0x01 + vpslldq xmm0,xmm0,8 + vpxorq xmm0,xmm25,xmm0 + + + vpclmulqdq xmm3,xmm4,xmm0,0x00 + vpsrldq xmm3,xmm3,4 + vpclmulqdq xmm14,xmm4,xmm0,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm3,xmm24,0x96 + +$L$_last_blocks_done_19: + vpshufb xmm2,xmm2,xmm29 + jmp NEAR $L$_ghash_done_10 +$L$_encrypt_32_blocks_10: + cmp r15b,240 + jae NEAR $L$_16_blocks_overflow_52 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + vpaddd zmm5,zmm4,zmm27 + jmp NEAR $L$_16_blocks_ok_52 +$L$_16_blocks_overflow_52: + vpshufb zmm2,zmm2,zmm29 + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpaddd zmm5,zmm4,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb zmm5,zmm5,zmm29 +$L$_16_blocks_ok_52: + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rsp] + + + + + vshufi64x2 zmm2,zmm5,zmm5,255 + add r15b,16 + + + 
vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + + + + + + + + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + + vpclmulqdq zmm6,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + + + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + + + + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + + vpternlogq zmm6,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + + + + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + + + + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20,ZMMWORD[128+r11*1+r9] + vmovdqu8 zmm21,ZMMWORD[192+r11*1+r9] + + + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + + + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm26,zmm10,zmm15 + vpxorq zmm24,zmm6,zmm12 + vpxorq zmm25,zmm7,zmm13 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vaesenclast zmm5,zmm5,zmm30 + + + + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vpxorq zmm5,zmm5,zmm21 + + + + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm4 + vmovdqu8 ZMMWORD[192+r11*1+r10],zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb zmm5,zmm5,zmm29 + vmovdqa64 ZMMWORD[1280+rsp],zmm0 + vmovdqa64 ZMMWORD[1344+rsp],zmm3 + vmovdqa64 ZMMWORD[1408+rsp],zmm4 + vmovdqa64 ZMMWORD[1472+rsp],zmm5 + cmp r15b,240 + jae NEAR $L$_16_blocks_overflow_53 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + vpaddd zmm5,zmm4,zmm27 + jmp NEAR $L$_16_blocks_ok_53 
+$L$_16_blocks_overflow_53: + vpshufb zmm2,zmm2,zmm29 + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpaddd zmm5,zmm4,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb zmm5,zmm5,zmm29 +$L$_16_blocks_ok_53: + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1024+rsp] + vmovdqu64 zmm1,ZMMWORD[256+rsp] + + + + + vshufi64x2 zmm2,zmm5,zmm5,255 + add r15b,16 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[320+rsp] + vmovdqa64 zmm22,ZMMWORD[1088+rsp] + + + + + + + + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + + vpclmulqdq zmm6,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[384+rsp] + vmovdqa64 zmm8,ZMMWORD[1152+rsp] + + + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[448+rsp] + vmovdqa64 zmm22,ZMMWORD[1216+rsp] + + + + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + + vpternlogq zmm6,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + + + + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + + + + vmovdqu8 zmm17,ZMMWORD[256+r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[320+r11*1+r9] + vmovdqu8 zmm20,ZMMWORD[384+r11*1+r9] + vmovdqu8 zmm21,ZMMWORD[448+r11*1+r9] + + + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + + + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm26,zmm10,zmm15,0x96 + vpternlogq zmm24,zmm6,zmm12,0x96 + vpternlogq zmm25,zmm7,zmm13,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vaesenclast zmm5,zmm5,zmm30 + + + + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vpxorq zmm5,zmm5,zmm21 + + + + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[256+r11*1+r10],zmm0 + vmovdqu8 
ZMMWORD[320+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[384+r11*1+r10],zmm4 + vmovdqu8 ZMMWORD[448+r11*1+r10],zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb zmm5,zmm5,zmm29 + vmovdqa64 ZMMWORD[768+rsp],zmm0 + vmovdqa64 ZMMWORD[832+rsp],zmm3 + vmovdqa64 ZMMWORD[896+rsp],zmm4 + vmovdqa64 ZMMWORD[960+rsp],zmm5 + vmovdqa64 zmm13,ZMMWORD[1280+rsp] + vmovdqu64 zmm12,ZMMWORD[512+rsp] + vpclmulqdq zmm0,zmm13,zmm12,0x11 + vpclmulqdq zmm3,zmm13,zmm12,0x00 + vpclmulqdq zmm4,zmm13,zmm12,0x01 + vpclmulqdq zmm5,zmm13,zmm12,0x10 + vmovdqa64 zmm13,ZMMWORD[1344+rsp] + vmovdqu64 zmm12,ZMMWORD[576+rsp] + vpclmulqdq zmm6,zmm13,zmm12,0x11 + vpclmulqdq zmm7,zmm13,zmm12,0x00 + vpclmulqdq zmm10,zmm13,zmm12,0x01 + vpclmulqdq zmm11,zmm13,zmm12,0x10 + vpternlogq zmm26,zmm4,zmm10,0x96 + vpternlogq zmm24,zmm0,zmm6,0x96 + vpternlogq zmm25,zmm3,zmm7,0x96 + vpternlogq zmm26,zmm5,zmm11,0x96 + vmovdqa64 zmm13,ZMMWORD[1408+rsp] + vmovdqu64 zmm12,ZMMWORD[640+rsp] + vpclmulqdq zmm0,zmm13,zmm12,0x11 + vpclmulqdq zmm3,zmm13,zmm12,0x00 + vpclmulqdq zmm4,zmm13,zmm12,0x01 + vpclmulqdq zmm5,zmm13,zmm12,0x10 + vmovdqa64 zmm13,ZMMWORD[1472+rsp] + vmovdqu64 zmm12,ZMMWORD[704+rsp] + vpclmulqdq zmm6,zmm13,zmm12,0x11 + vpclmulqdq zmm7,zmm13,zmm12,0x00 + vpclmulqdq zmm10,zmm13,zmm12,0x01 + vpclmulqdq zmm11,zmm13,zmm12,0x10 + + vpternlogq zmm26,zmm4,zmm10,0x96 + vpternlogq zmm24,zmm0,zmm6,0x96 + vpternlogq zmm25,zmm3,zmm7,0x96 + vpternlogq zmm26,zmm5,zmm11,0x96 + + vpsrldq zmm0,zmm26,8 + vpslldq zmm3,zmm26,8 + vpxorq zmm24,zmm24,zmm0 + vpxorq zmm25,zmm25,zmm3 + vextracti64x4 ymm0,zmm24,1 + vpxorq ymm24,ymm24,ymm0 + vextracti32x4 xmm0,ymm24,1 + vpxorq xmm24,xmm24,xmm0 + vextracti64x4 ymm3,zmm25,1 + vpxorq ymm25,ymm25,ymm3 + vextracti32x4 xmm3,ymm25,1 + vpxorq xmm25,xmm25,xmm3 + vmovdqa64 xmm4,XMMWORD[POLY2] + + + vpclmulqdq xmm0,xmm4,xmm25,0x01 + vpslldq xmm0,xmm0,8 + vpxorq xmm0,xmm25,xmm0 + + + vpclmulqdq xmm3,xmm4,xmm0,0x00 + vpsrldq xmm3,xmm3,4 + vpclmulqdq xmm14,xmm4,xmm0,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm3,xmm24,0x96 + + sub r13,512 + add r11,512 + mov r10d,r13d + and r10d,~15 + mov ebx,512 + sub ebx,r10d + mov r10d,r13d + add r10d,15 + shr r10d,4 + je NEAR $L$_last_num_blocks_is_0_54 + + cmp r10d,8 + je NEAR $L$_last_num_blocks_is_8_54 + jb NEAR $L$_last_num_blocks_is_7_1_54 + + + cmp r10d,12 + je NEAR $L$_last_num_blocks_is_12_54 + jb NEAR $L$_last_num_blocks_is_11_9_54 + + + cmp r10d,15 + je NEAR $L$_last_num_blocks_is_15_54 + ja NEAR $L$_last_num_blocks_is_16_54 + cmp r10d,14 + je NEAR $L$_last_num_blocks_is_14_54 + jmp NEAR $L$_last_num_blocks_is_13_54 + +$L$_last_num_blocks_is_11_9_54: + + cmp r10d,10 + je NEAR $L$_last_num_blocks_is_10_54 + ja NEAR $L$_last_num_blocks_is_11_54 + jmp NEAR $L$_last_num_blocks_is_9_54 + +$L$_last_num_blocks_is_7_1_54: + cmp r10d,4 + je NEAR $L$_last_num_blocks_is_4_54 + jb NEAR $L$_last_num_blocks_is_3_1_54 + + cmp r10d,6 + ja NEAR $L$_last_num_blocks_is_7_54 + je NEAR $L$_last_num_blocks_is_6_54 + jmp NEAR $L$_last_num_blocks_is_5_54 + +$L$_last_num_blocks_is_3_1_54: + + cmp r10d,2 + ja NEAR $L$_last_num_blocks_is_3_54 + je NEAR $L$_last_num_blocks_is_2_54 +$L$_last_num_blocks_is_1_54: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + kmovq k1,[rax*8+r10] + cmp r15d,255 + jae NEAR $L$_16_blocks_overflow_55 + vpaddd xmm0,xmm2,xmm28 + jmp NEAR $L$_16_blocks_ok_55 + +$L$_16_blocks_overflow_55: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpshufb xmm0,xmm0,xmm29 +$L$_16_blocks_ok_55: + + + + + vbroadcastf64x2 
zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm0,0 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc xmm0,xmm0,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc xmm0,xmm0,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 xmm17{k1}{z},[r11*1+r9] + vaesenc xmm0,xmm0,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc xmm0,xmm0,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc xmm0,xmm0,xmm31 + vaesenclast xmm0,xmm0,xmm30 + vpxorq xmm0,xmm0,xmm17 + vextracti32x4 xmm11,zmm0,0 + mov r10,QWORD[120+rbp] + vmovdqu8 XMMWORD[r11*1+r10]{k1},xmm0 + vmovdqu8 zmm0{k1}{z},zmm0 + vpshufb xmm17,xmm0,xmm29 + vextracti32x4 xmm7,zmm17,0 + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_56 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm17,xmm1,0x01 + vpclmulqdq xmm5,xmm17,xmm1,0x10 + vpclmulqdq xmm0,xmm17,xmm1,0x11 + vpclmulqdq xmm3,xmm17,xmm1,0x00 + vpxorq zmm4,zmm4,zmm26 + vpxorq zmm0,zmm0,zmm24 + vpxorq zmm3,zmm3,zmm25 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_56 +$L$_small_initial_partial_block_56: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + + + vpsrldq zmm0,zmm26,8 + vpslldq zmm3,zmm26,8 + vpxorq zmm24,zmm24,zmm0 + vpxorq zmm25,zmm25,zmm3 + vextracti64x4 ymm0,zmm24,1 + vpxorq ymm24,ymm24,ymm0 + vextracti32x4 xmm0,ymm24,1 + vpxorq xmm24,xmm24,xmm0 + vextracti64x4 ymm3,zmm25,1 + vpxorq ymm25,ymm25,ymm3 + vextracti32x4 xmm3,ymm25,1 + 
vpxorq xmm25,xmm25,xmm3 + vmovdqa64 xmm0,XMMWORD[POLY2] + + + vpclmulqdq xmm3,xmm0,xmm25,0x01 + vpslldq xmm3,xmm3,8 + vpxorq xmm3,xmm25,xmm3 + + + vpclmulqdq xmm4,xmm0,xmm3,0x00 + vpsrldq xmm4,xmm4,4 + vpclmulqdq xmm14,xmm0,xmm3,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm4,xmm24,0x96 + + + + + + + + + + + + + vpxorq xmm14,xmm14,xmm7 + + jmp NEAR $L$_after_reduction_56 +$L$_small_initial_compute_done_56: +$L$_after_reduction_56: + jmp NEAR $L$_last_blocks_done_54 +$L$_last_num_blocks_is_2_54: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + kmovq k1,[rax*8+r10] + cmp r15d,254 + jae NEAR $L$_16_blocks_overflow_57 + vpaddd ymm0,ymm2,ymm28 + jmp NEAR $L$_16_blocks_ok_57 + +$L$_16_blocks_overflow_57: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpshufb ymm0,ymm0,ymm29 +$L$_16_blocks_ok_57: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm0,1 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc ymm0,ymm0,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc ymm0,ymm0,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 ymm17{k1}{z},[r11*1+r9] + vaesenc ymm0,ymm0,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc ymm0,ymm0,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc ymm0,ymm0,ymm31 + vaesenclast ymm0,ymm0,ymm30 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm11,zmm0,1 + mov r10,QWORD[120+rbp] + vmovdqu8 YMMWORD[r11*1+r10]{k1},ymm0 + vmovdqu8 zmm0{k1}{z},zmm0 + vpshufb ymm17,ymm0,ymm29 + vextracti32x4 xmm7,zmm17,1 + sub r13,16 * (2 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_58 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm17,ymm1,0x01 + vpclmulqdq ymm5,ymm17,ymm1,0x10 + vpclmulqdq ymm0,ymm17,ymm1,0x11 + vpclmulqdq ymm3,ymm17,ymm1,0x00 + vpxorq zmm4,zmm4,zmm26 + vpxorq zmm0,zmm0,zmm24 + vpxorq zmm3,zmm3,zmm25 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + 
vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_58 +$L$_small_initial_partial_block_58: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm17,xmm1,0x01 + vpclmulqdq xmm5,xmm17,xmm1,0x10 + vpclmulqdq xmm0,xmm17,xmm1,0x11 + vpclmulqdq xmm3,xmm17,xmm1,0x00 + vpxorq zmm4,zmm4,zmm26 + vpxorq zmm0,zmm0,zmm24 + vpxorq zmm3,zmm3,zmm25 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_58: + + or r13,r13 + je NEAR $L$_after_reduction_58 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_58: + jmp NEAR $L$_last_blocks_done_54 +$L$_last_num_blocks_is_3_54: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + kmovq k1,[rax*8+r10] + cmp r15d,253 + jae NEAR $L$_16_blocks_overflow_59 + vpaddd zmm0,zmm2,zmm28 + jmp NEAR $L$_16_blocks_ok_59 + +$L$_16_blocks_overflow_59: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpshufb zmm0,zmm0,zmm29 +$L$_16_blocks_ok_59: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm0,2 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17{k1}{z},[r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq 
zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vpxorq zmm0,zmm0,zmm17 + vextracti32x4 xmm11,zmm0,2 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10]{k1},zmm0 + vmovdqu8 zmm0{k1}{z},zmm0 + vpshufb zmm17,zmm0,zmm29 + vextracti32x4 xmm7,zmm17,2 + sub r13,16 * (3 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_60 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpxorq zmm4,zmm4,zmm26 + vpxorq zmm0,zmm0,zmm24 + vpxorq zmm3,zmm3,zmm25 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_60 +$L$_small_initial_partial_block_60: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm17,ymm1,0x01 + vpclmulqdq ymm5,ymm17,ymm1,0x10 + vpclmulqdq ymm0,ymm17,ymm1,0x11 + vpclmulqdq ymm3,ymm17,ymm1,0x00 + vpxorq zmm4,zmm4,zmm26 + vpxorq zmm0,zmm0,zmm24 + vpxorq zmm3,zmm3,zmm25 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_60: + + or r13,r13 + je NEAR $L$_after_reduction_60 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_60: + jmp NEAR $L$_last_blocks_done_54 +$L$_last_num_blocks_is_4_54: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + kmovq k1,[rax*8+r10] + cmp r15d,252 + jae NEAR $L$_16_blocks_overflow_61 + vpaddd zmm0,zmm2,zmm28 + jmp NEAR $L$_16_blocks_ok_61 + +$L$_16_blocks_overflow_61: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpshufb zmm0,zmm0,zmm29 +$L$_16_blocks_ok_61: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm0,3 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + 
vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17{k1}{z},[r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vpxorq zmm0,zmm0,zmm17 + vextracti32x4 xmm11,zmm0,3 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10]{k1},zmm0 + vmovdqu8 zmm0{k1}{z},zmm0 + vpshufb zmm17,zmm0,zmm29 + vextracti32x4 xmm7,zmm17,3 + sub r13,16 * (4 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_62 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + + vpxorq zmm30,zmm30,zmm26 + vpxorq zmm8,zmm8,zmm24 + vpxorq zmm22,zmm22,zmm25 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_62 +$L$_small_initial_partial_block_62: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpxorq zmm4,zmm4,zmm26 + vpxorq zmm0,zmm0,zmm24 + vpxorq zmm3,zmm3,zmm25 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 
xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_62: + + or r13,r13 + je NEAR $L$_after_reduction_62 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_62: + jmp NEAR $L$_last_blocks_done_54 +$L$_last_num_blocks_is_5_54: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,64 + kmovq k1,[rax*8+r10] + cmp r15d,251 + jae NEAR $L$_16_blocks_overflow_63 + vpaddd zmm0,zmm2,zmm28 + vpaddd xmm3,xmm0,xmm27 + jmp NEAR $L$_16_blocks_ok_63 + +$L$_16_blocks_overflow_63: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb xmm3,xmm3,xmm29 +$L$_16_blocks_ok_63: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm3,0 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 xmm19{k1}{z},[64+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast xmm3,xmm3,xmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq xmm3,xmm3,xmm19 + vextracti32x4 xmm11,zmm3,0 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 XMMWORD[64+r11*1+r10]{k1},xmm3 + vmovdqu8 zmm3{k1}{z},zmm3 + vpshufb zmm17,zmm0,zmm29 + vpshufb xmm19,xmm3,xmm29 + vextracti32x4 xmm7,zmm19,0 + sub r13,16 * (5 - 1) + + + cmp r13,16 + jl NEAR 
$L$_small_initial_partial_block_64 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm19,xmm1,0x01 + vpclmulqdq xmm5,xmm19,xmm1,0x10 + vpclmulqdq xmm0,xmm19,xmm1,0x11 + vpclmulqdq xmm3,xmm19,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_64 +$L$_small_initial_partial_block_64: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + + vpxorq zmm30,zmm30,zmm26 + vpxorq zmm8,zmm8,zmm24 + vpxorq zmm22,zmm22,zmm25 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_64: + + or r13,r13 + je NEAR $L$_after_reduction_64 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_64: + jmp NEAR $L$_last_blocks_done_54 +$L$_last_num_blocks_is_6_54: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,64 + kmovq k1,[rax*8+r10] + cmp r15d,250 + jae NEAR $L$_16_blocks_overflow_65 + vpaddd zmm0,zmm2,zmm28 + vpaddd ymm3,ymm0,ymm27 + jmp NEAR $L$_16_blocks_ok_65 + +$L$_16_blocks_overflow_65: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb ymm3,ymm3,ymm29 +$L$_16_blocks_ok_65: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm3,1 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 
+ vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 ymm19{k1}{z},[64+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast ymm3,ymm3,ymm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm11,zmm3,1 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 YMMWORD[64+r11*1+r10]{k1},ymm3 + vmovdqu8 zmm3{k1}{z},zmm3 + vpshufb zmm17,zmm0,zmm29 + vpshufb ymm19,ymm3,ymm29 + vextracti32x4 xmm7,zmm19,1 + sub r13,16 * (6 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_66 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm19,ymm1,0x01 + vpclmulqdq ymm5,ymm19,ymm1,0x10 + vpclmulqdq ymm0,ymm19,ymm1,0x11 + vpclmulqdq ymm3,ymm19,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_66 +$L$_small_initial_partial_block_66: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm19,xmm1,0x01 + vpclmulqdq xmm5,xmm19,xmm1,0x10 + vpclmulqdq xmm0,xmm19,xmm1,0x11 + vpclmulqdq 
xmm3,xmm19,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_66: + + or r13,r13 + je NEAR $L$_after_reduction_66 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_66: + jmp NEAR $L$_last_blocks_done_54 +$L$_last_num_blocks_is_7_54: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,64 + kmovq k1,[rax*8+r10] + cmp r15d,249 + jae NEAR $L$_16_blocks_overflow_67 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + jmp NEAR $L$_16_blocks_ok_67 + +$L$_16_blocks_overflow_67: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 +$L$_16_blocks_ok_67: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm3,2 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19{k1}{z},[64+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc 
zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti32x4 xmm11,zmm3,2 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10]{k1},zmm3 + vmovdqu8 zmm3{k1}{z},zmm3 + vpshufb zmm17,zmm0,zmm29 + vpshufb zmm19,zmm3,zmm29 + vextracti32x4 xmm7,zmm19,2 + sub r13,16 * (7 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_68 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm19,zmm1,0x01 + vpclmulqdq zmm5,zmm19,zmm1,0x10 + vpclmulqdq zmm0,zmm19,zmm1,0x11 + vpclmulqdq zmm3,zmm19,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_68 +$L$_small_initial_partial_block_68: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm19,ymm1,0x01 + vpclmulqdq ymm5,ymm19,ymm1,0x10 + vpclmulqdq ymm0,ymm19,ymm1,0x11 + vpclmulqdq ymm3,ymm19,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_68: + + or r13,r13 + je NEAR $L$_after_reduction_68 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_68: + jmp NEAR $L$_last_blocks_done_54 +$L$_last_num_blocks_is_8_54: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,64 + kmovq k1,[rax*8+r10] + cmp r15d,248 + jae NEAR $L$_16_blocks_overflow_69 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + jmp NEAR $L$_16_blocks_ok_69 + +$L$_16_blocks_overflow_69: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpshufb zmm0,zmm0,zmm29 + 
vpshufb zmm3,zmm3,zmm29 +$L$_16_blocks_ok_69: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm3,3 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19{k1}{z},[64+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti32x4 xmm11,zmm3,3 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10]{k1},zmm3 + vmovdqu8 zmm3{k1}{z},zmm3 + vpshufb zmm17,zmm0,zmm29 + vpshufb zmm19,zmm3,zmm29 + vextracti32x4 xmm7,zmm19,3 + sub r13,16 * (8 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_70 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[224+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + + vpxorq zmm30,zmm30,zmm26 + vpxorq zmm8,zmm8,zmm24 + vpxorq zmm22,zmm22,zmm25 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + 
vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_70 +$L$_small_initial_partial_block_70: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm19,zmm1,0x01 + vpclmulqdq zmm5,zmm19,zmm1,0x10 + vpclmulqdq zmm0,zmm19,zmm1,0x11 + vpclmulqdq zmm3,zmm19,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_70: + + or r13,r13 + je NEAR $L$_after_reduction_70 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_70: + jmp NEAR $L$_last_blocks_done_54 +$L$_last_num_blocks_is_9_54: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,128 + kmovq k1,[rax*8+r10] + cmp r15d,247 + jae NEAR $L$_16_blocks_overflow_71 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd xmm4,xmm3,xmm27 + jmp NEAR $L$_16_blocks_ok_71 + +$L$_16_blocks_overflow_71: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb xmm4,xmm4,xmm29 +$L$_16_blocks_ok_71: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm4,0 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq 
zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 xmm20{k1}{z},[128+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast xmm4,xmm4,xmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq xmm4,xmm4,xmm20 + vextracti32x4 xmm11,zmm4,0 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 XMMWORD[128+r11*1+r10]{k1},xmm4 + vmovdqu8 zmm4{k1}{z},zmm4 + vpshufb zmm17,zmm0,zmm29 + vpshufb zmm19,zmm3,zmm29 + vpshufb xmm20,xmm4,xmm29 + vextracti32x4 xmm7,zmm20,0 + sub r13,16 * (9 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_72 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[208+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm20,xmm1,0x01 + vpclmulqdq xmm5,xmm20,xmm1,0x10 + vpclmulqdq xmm0,xmm20,xmm1,0x11 + vpclmulqdq xmm3,xmm20,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_72 +$L$_small_initial_partial_block_72: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[224+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq 
zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + + vpxorq zmm30,zmm30,zmm26 + vpxorq zmm8,zmm8,zmm24 + vpxorq zmm22,zmm22,zmm25 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_72: + + or r13,r13 + je NEAR $L$_after_reduction_72 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_72: + jmp NEAR $L$_last_blocks_done_54 +$L$_last_num_blocks_is_10_54: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,128 + kmovq k1,[rax*8+r10] + cmp r15d,246 + jae NEAR $L$_16_blocks_overflow_73 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd ymm4,ymm3,ymm27 + jmp NEAR $L$_16_blocks_ok_73 + +$L$_16_blocks_overflow_73: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb ymm4,ymm4,ymm29 +$L$_16_blocks_ok_73: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm4,1 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 ymm20{k1}{z},[128+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + 
vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast ymm4,ymm4,ymm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq ymm4,ymm4,ymm20 + vextracti32x4 xmm11,zmm4,1 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 YMMWORD[128+r11*1+r10]{k1},ymm4 + vmovdqu8 zmm4{k1}{z},zmm4 + vpshufb zmm17,zmm0,zmm29 + vpshufb zmm19,zmm3,zmm29 + vpshufb ymm20,ymm4,ymm29 + vextracti32x4 xmm7,zmm20,1 + sub r13,16 * (10 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_74 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[192+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm20,ymm1,0x01 + vpclmulqdq ymm5,ymm20,ymm1,0x10 + vpclmulqdq ymm0,ymm20,ymm1,0x11 + vpclmulqdq ymm3,ymm20,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_74 +$L$_small_initial_partial_block_74: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[208+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm20,xmm1,0x01 + vpclmulqdq xmm5,xmm20,xmm1,0x10 + vpclmulqdq xmm0,xmm20,xmm1,0x11 + vpclmulqdq xmm3,xmm20,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + 
vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_74: + + or r13,r13 + je NEAR $L$_after_reduction_74 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_74: + jmp NEAR $L$_last_blocks_done_54 +$L$_last_num_blocks_is_11_54: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,128 + kmovq k1,[rax*8+r10] + cmp r15d,245 + jae NEAR $L$_16_blocks_overflow_75 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + jmp NEAR $L$_16_blocks_ok_75 + +$L$_16_blocks_overflow_75: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 +$L$_16_blocks_ok_75: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm4,2 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20{k1}{z},[128+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq 
zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vextracti32x4 xmm11,zmm4,2 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10]{k1},zmm4 + vmovdqu8 zmm4{k1}{z},zmm4 + vpshufb zmm17,zmm0,zmm29 + vpshufb zmm19,zmm3,zmm29 + vpshufb zmm20,zmm4,zmm29 + vextracti32x4 xmm7,zmm20,2 + sub r13,16 * (11 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_76 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[176+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm20,zmm1,0x01 + vpclmulqdq zmm5,zmm20,zmm1,0x10 + vpclmulqdq zmm0,zmm20,zmm1,0x11 + vpclmulqdq zmm3,zmm20,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_76 +$L$_small_initial_partial_block_76: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[192+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm20,ymm1,0x01 + vpclmulqdq ymm5,ymm20,ymm1,0x10 + vpclmulqdq ymm0,ymm20,ymm1,0x11 + vpclmulqdq ymm3,ymm20,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 
xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_76: + + or r13,r13 + je NEAR $L$_after_reduction_76 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_76: + jmp NEAR $L$_last_blocks_done_54 +$L$_last_num_blocks_is_12_54: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,128 + kmovq k1,[rax*8+r10] + cmp r15d,244 + jae NEAR $L$_16_blocks_overflow_77 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + jmp NEAR $L$_16_blocks_ok_77 + +$L$_16_blocks_overflow_77: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 +$L$_16_blocks_ok_77: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm4,3 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20{k1}{z},[128+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + 
vaesenc zmm4,zmm4,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vextracti32x4 xmm11,zmm4,3 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10]{k1},zmm4 + vmovdqu8 zmm4{k1}{z},zmm4 + vpshufb zmm17,zmm0,zmm29 + vpshufb zmm19,zmm3,zmm29 + vpshufb zmm20,zmm4,zmm29 + vextracti32x4 xmm7,zmm20,3 + sub r13,16 * (12 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_78 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[160+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[224+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + + vpxorq zmm30,zmm30,zmm26 + vpxorq zmm8,zmm8,zmm24 + vpxorq zmm22,zmm22,zmm25 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_78 +$L$_small_initial_partial_block_78: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[176+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm20,zmm1,0x01 + vpclmulqdq zmm5,zmm20,zmm1,0x10 + vpclmulqdq zmm0,zmm20,zmm1,0x11 + vpclmulqdq zmm3,zmm20,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_78: + 
+ or r13,r13 + je NEAR $L$_after_reduction_78 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_78: + jmp NEAR $L$_last_blocks_done_54 +$L$_last_num_blocks_is_13_54: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,192 + kmovq k1,[rax*8+r10] + cmp r15d,243 + jae NEAR $L$_16_blocks_overflow_79 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + vpaddd xmm5,xmm4,xmm27 + jmp NEAR $L$_16_blocks_ok_79 + +$L$_16_blocks_overflow_79: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpaddd zmm5,zmm4,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb xmm5,xmm5,xmm29 +$L$_16_blocks_ok_79: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm5,0 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vpxorq xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20,ZMMWORD[128+r11*1+r9] + vmovdqu8 xmm21{k1}{z},[192+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + 
vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vaesenclast xmm5,xmm5,xmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vpxorq xmm5,xmm5,xmm21 + vextracti32x4 xmm11,zmm5,0 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm4 + vmovdqu8 XMMWORD[192+r11*1+r10]{k1},xmm5 + vmovdqu8 zmm5{k1}{z},zmm5 + vpshufb zmm17,zmm0,zmm29 + vpshufb zmm19,zmm3,zmm29 + vpshufb zmm20,zmm4,zmm29 + vpshufb xmm21,xmm5,xmm29 + vextracti32x4 xmm7,zmm21,0 + sub r13,16 * (13 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_80 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[144+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[208+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm21,xmm1,0x01 + vpclmulqdq xmm5,xmm21,xmm1,0x10 + vpclmulqdq xmm0,xmm21,xmm1,0x11 + vpclmulqdq xmm3,xmm21,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_80 +$L$_small_initial_partial_block_80: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[160+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[224+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + + vpxorq zmm30,zmm30,zmm26 + vpxorq zmm8,zmm8,zmm24 + vpxorq zmm22,zmm22,zmm25 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq 
xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_80: + + or r13,r13 + je NEAR $L$_after_reduction_80 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_80: + jmp NEAR $L$_last_blocks_done_54 +$L$_last_num_blocks_is_14_54: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,192 + kmovq k1,[rax*8+r10] + cmp r15d,242 + jae NEAR $L$_16_blocks_overflow_81 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + vpaddd ymm5,ymm4,ymm27 + jmp NEAR $L$_16_blocks_ok_81 + +$L$_16_blocks_overflow_81: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpaddd zmm5,zmm4,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb ymm5,ymm5,ymm29 +$L$_16_blocks_ok_81: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm5,1 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vpxorq ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20,ZMMWORD[128+r11*1+r9] + vmovdqu8 ymm21{k1}{z},[192+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc 
zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vaesenclast ymm5,ymm5,ymm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vpxorq ymm5,ymm5,ymm21 + vextracti32x4 xmm11,zmm5,1 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm4 + vmovdqu8 YMMWORD[192+r11*1+r10]{k1},ymm5 + vmovdqu8 zmm5{k1}{z},zmm5 + vpshufb zmm17,zmm0,zmm29 + vpshufb zmm19,zmm3,zmm29 + vpshufb zmm20,zmm4,zmm29 + vpshufb ymm21,ymm5,ymm29 + vextracti32x4 xmm7,zmm21,1 + sub r13,16 * (14 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_82 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[128+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[192+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm21,ymm1,0x01 + vpclmulqdq ymm5,ymm21,ymm1,0x10 + vpclmulqdq ymm0,ymm21,ymm1,0x11 + vpclmulqdq ymm3,ymm21,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_82 +$L$_small_initial_partial_block_82: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[144+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[208+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm21,xmm1,0x01 + 
vpclmulqdq xmm5,xmm21,xmm1,0x10 + vpclmulqdq xmm0,xmm21,xmm1,0x11 + vpclmulqdq xmm3,xmm21,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_82: + + or r13,r13 + je NEAR $L$_after_reduction_82 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_82: + jmp NEAR $L$_last_blocks_done_54 +$L$_last_num_blocks_is_15_54: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,192 + kmovq k1,[rax*8+r10] + cmp r15d,241 + jae NEAR $L$_16_blocks_overflow_83 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + vpaddd zmm5,zmm4,zmm27 + jmp NEAR $L$_16_blocks_ok_83 + +$L$_16_blocks_overflow_83: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpaddd zmm5,zmm4,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb zmm5,zmm5,zmm29 +$L$_16_blocks_ok_83: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm5,2 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20,ZMMWORD[128+r11*1+r9] + vmovdqu8 zmm21{k1}{z},[192+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 
+ vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vaesenclast zmm5,zmm5,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vpxorq zmm5,zmm5,zmm21 + vextracti32x4 xmm11,zmm5,2 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm4 + vmovdqu8 ZMMWORD[192+r11*1+r10]{k1},zmm5 + vmovdqu8 zmm5{k1}{z},zmm5 + vpshufb zmm17,zmm0,zmm29 + vpshufb zmm19,zmm3,zmm29 + vpshufb zmm20,zmm4,zmm29 + vpshufb zmm21,zmm5,zmm29 + vextracti32x4 xmm7,zmm21,2 + sub r13,16 * (15 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_84 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[112+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[176+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm21,zmm1,0x01 + vpclmulqdq zmm5,zmm21,zmm1,0x10 + vpclmulqdq zmm0,zmm21,zmm1,0x11 + vpclmulqdq zmm3,zmm21,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_84 +$L$_small_initial_partial_block_84: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[128+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq 
zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[192+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm21,ymm1,0x01 + vpclmulqdq ymm5,ymm21,ymm1,0x10 + vpclmulqdq ymm0,ymm21,ymm1,0x11 + vpclmulqdq ymm3,ymm21,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_84: + + or r13,r13 + je NEAR $L$_after_reduction_84 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_84: + jmp NEAR $L$_last_blocks_done_54 +$L$_last_num_blocks_is_16_54: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,192 + kmovq k1,[rax*8+r10] + cmp r15d,240 + jae NEAR $L$_16_blocks_overflow_85 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + vpaddd zmm5,zmm4,zmm27 + jmp NEAR $L$_16_blocks_ok_85 + +$L$_16_blocks_overflow_85: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpaddd zmm5,zmm4,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb zmm5,zmm5,zmm29 +$L$_16_blocks_ok_85: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm5,3 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + 
vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20,ZMMWORD[128+r11*1+r9] + vmovdqu8 zmm21{k1}{z},[192+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vaesenclast zmm5,zmm5,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vpxorq zmm5,zmm5,zmm21 + vextracti32x4 xmm11,zmm5,3 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm4 + vmovdqu8 ZMMWORD[192+r11*1+r10]{k1},zmm5 + vmovdqu8 zmm5{k1}{z},zmm5 + vpshufb zmm17,zmm0,zmm29 + vpshufb zmm19,zmm3,zmm29 + vpshufb zmm20,zmm4,zmm29 + vpshufb zmm21,zmm5,zmm29 + vextracti32x4 xmm7,zmm21,3 + sub r13,16 * (16 - 1) +$L$_small_initial_partial_block_86: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[112+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[176+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm21,zmm1,0x01 + vpclmulqdq zmm5,zmm21,zmm1,0x10 + vpclmulqdq zmm0,zmm21,zmm1,0x11 + vpclmulqdq zmm3,zmm21,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq 
xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_86: + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_86: + jmp NEAR $L$_last_blocks_done_54 +$L$_last_num_blocks_is_0_54: + vmovdqa64 zmm13,ZMMWORD[768+rsp] + vpxorq zmm13,zmm13,zmm14 + vmovdqu64 zmm12,ZMMWORD[rbx*1+rsp] + vpclmulqdq zmm0,zmm13,zmm12,0x11 + vpclmulqdq zmm3,zmm13,zmm12,0x00 + vpclmulqdq zmm4,zmm13,zmm12,0x01 + vpclmulqdq zmm5,zmm13,zmm12,0x10 + vmovdqa64 zmm13,ZMMWORD[832+rsp] + vmovdqu64 zmm12,ZMMWORD[64+rbx*1+rsp] + vpclmulqdq zmm6,zmm13,zmm12,0x11 + vpclmulqdq zmm7,zmm13,zmm12,0x00 + vpclmulqdq zmm10,zmm13,zmm12,0x01 + vpclmulqdq zmm11,zmm13,zmm12,0x10 + vpxorq zmm26,zmm4,zmm10 + vpxorq zmm24,zmm0,zmm6 + vpxorq zmm25,zmm3,zmm7 + vpternlogq zmm26,zmm5,zmm11,0x96 + vmovdqa64 zmm13,ZMMWORD[896+rsp] + vmovdqu64 zmm12,ZMMWORD[128+rbx*1+rsp] + vpclmulqdq zmm0,zmm13,zmm12,0x11 + vpclmulqdq zmm3,zmm13,zmm12,0x00 + vpclmulqdq zmm4,zmm13,zmm12,0x01 + vpclmulqdq zmm5,zmm13,zmm12,0x10 + vmovdqa64 zmm13,ZMMWORD[960+rsp] + vmovdqu64 zmm12,ZMMWORD[192+rbx*1+rsp] + vpclmulqdq zmm6,zmm13,zmm12,0x11 + vpclmulqdq zmm7,zmm13,zmm12,0x00 + vpclmulqdq zmm10,zmm13,zmm12,0x01 + vpclmulqdq zmm11,zmm13,zmm12,0x10 + + vpternlogq zmm26,zmm4,zmm10,0x96 + vpternlogq zmm24,zmm0,zmm6,0x96 + vpternlogq zmm25,zmm3,zmm7,0x96 + vpternlogq zmm26,zmm5,zmm11,0x96 + + vpsrldq zmm0,zmm26,8 + vpslldq zmm3,zmm26,8 + vpxorq zmm24,zmm24,zmm0 + vpxorq zmm25,zmm25,zmm3 + vextracti64x4 ymm0,zmm24,1 + vpxorq ymm24,ymm24,ymm0 + vextracti32x4 xmm0,ymm24,1 + vpxorq xmm24,xmm24,xmm0 + vextracti64x4 ymm3,zmm25,1 + vpxorq ymm25,ymm25,ymm3 + vextracti32x4 xmm3,ymm25,1 + vpxorq xmm25,xmm25,xmm3 + vmovdqa64 xmm4,XMMWORD[POLY2] + + + vpclmulqdq xmm0,xmm4,xmm25,0x01 + vpslldq xmm0,xmm0,8 + vpxorq xmm0,xmm25,xmm0 + + + vpclmulqdq xmm3,xmm4,xmm0,0x00 + vpsrldq xmm3,xmm3,4 + vpclmulqdq xmm14,xmm4,xmm0,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm3,xmm24,0x96 + +$L$_last_blocks_done_54: + vpshufb xmm2,xmm2,xmm29 + jmp NEAR $L$_ghash_done_10 +$L$_encrypt_16_blocks_10: + cmp r15b,240 + jae NEAR $L$_16_blocks_overflow_87 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + vpaddd zmm5,zmm4,zmm27 + jmp NEAR $L$_16_blocks_ok_87 +$L$_16_blocks_overflow_87: + vpshufb zmm2,zmm2,zmm29 + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpaddd zmm5,zmm4,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb zmm5,zmm5,zmm29 +$L$_16_blocks_ok_87: + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rsp] + + + + + vshufi64x2 zmm2,zmm5,zmm5,255 + add r15b,16 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + + + + + + + + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + + vpclmulqdq zmm6,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + + + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 
zmm31,ZMMWORD[48+rcx] + + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + + + + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + + vpternlogq zmm6,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + + + + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + + + + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20,ZMMWORD[128+r11*1+r9] + vmovdqu8 zmm21,ZMMWORD[192+r11*1+r9] + + + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + + + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm26,zmm10,zmm15 + vpxorq zmm24,zmm6,zmm12 + vpxorq zmm25,zmm7,zmm13 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vaesenclast zmm5,zmm5,zmm30 + + + + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vpxorq zmm5,zmm5,zmm21 + + + + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm4 + vmovdqu8 ZMMWORD[192+r11*1+r10],zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb zmm5,zmm5,zmm29 + vmovdqa64 ZMMWORD[1280+rsp],zmm0 + vmovdqa64 ZMMWORD[1344+rsp],zmm3 + vmovdqa64 ZMMWORD[1408+rsp],zmm4 + vmovdqa64 ZMMWORD[1472+rsp],zmm5 + vmovdqa64 zmm13,ZMMWORD[1024+rsp] + vmovdqu64 zmm12,ZMMWORD[256+rsp] + vpclmulqdq zmm0,zmm13,zmm12,0x11 + vpclmulqdq zmm3,zmm13,zmm12,0x00 + vpclmulqdq zmm4,zmm13,zmm12,0x01 + vpclmulqdq zmm5,zmm13,zmm12,0x10 + vmovdqa64 zmm13,ZMMWORD[1088+rsp] + vmovdqu64 zmm12,ZMMWORD[320+rsp] + vpclmulqdq zmm6,zmm13,zmm12,0x11 + vpclmulqdq zmm7,zmm13,zmm12,0x00 + vpclmulqdq zmm10,zmm13,zmm12,0x01 + vpclmulqdq zmm11,zmm13,zmm12,0x10 + vpternlogq zmm26,zmm4,zmm10,0x96 + vpternlogq zmm24,zmm0,zmm6,0x96 + vpternlogq zmm25,zmm3,zmm7,0x96 + vpternlogq zmm26,zmm5,zmm11,0x96 + vmovdqa64 zmm13,ZMMWORD[1152+rsp] + vmovdqu64 zmm12,ZMMWORD[384+rsp] + vpclmulqdq zmm0,zmm13,zmm12,0x11 + vpclmulqdq zmm3,zmm13,zmm12,0x00 + vpclmulqdq zmm4,zmm13,zmm12,0x01 + vpclmulqdq zmm5,zmm13,zmm12,0x10 + vmovdqa64 
zmm13,ZMMWORD[1216+rsp] + vmovdqu64 zmm12,ZMMWORD[448+rsp] + vpclmulqdq zmm6,zmm13,zmm12,0x11 + vpclmulqdq zmm7,zmm13,zmm12,0x00 + vpclmulqdq zmm10,zmm13,zmm12,0x01 + vpclmulqdq zmm11,zmm13,zmm12,0x10 + + vpternlogq zmm26,zmm4,zmm10,0x96 + vpternlogq zmm24,zmm0,zmm6,0x96 + vpternlogq zmm25,zmm3,zmm7,0x96 + vpternlogq zmm26,zmm5,zmm11,0x96 + sub r13,256 + add r11,256 + mov r10d,r13d + add r10d,15 + shr r10d,4 + je NEAR $L$_last_num_blocks_is_0_88 + + cmp r10d,8 + je NEAR $L$_last_num_blocks_is_8_88 + jb NEAR $L$_last_num_blocks_is_7_1_88 + + + cmp r10d,12 + je NEAR $L$_last_num_blocks_is_12_88 + jb NEAR $L$_last_num_blocks_is_11_9_88 + + + cmp r10d,15 + je NEAR $L$_last_num_blocks_is_15_88 + ja NEAR $L$_last_num_blocks_is_16_88 + cmp r10d,14 + je NEAR $L$_last_num_blocks_is_14_88 + jmp NEAR $L$_last_num_blocks_is_13_88 + +$L$_last_num_blocks_is_11_9_88: + + cmp r10d,10 + je NEAR $L$_last_num_blocks_is_10_88 + ja NEAR $L$_last_num_blocks_is_11_88 + jmp NEAR $L$_last_num_blocks_is_9_88 + +$L$_last_num_blocks_is_7_1_88: + cmp r10d,4 + je NEAR $L$_last_num_blocks_is_4_88 + jb NEAR $L$_last_num_blocks_is_3_1_88 + + cmp r10d,6 + ja NEAR $L$_last_num_blocks_is_7_88 + je NEAR $L$_last_num_blocks_is_6_88 + jmp NEAR $L$_last_num_blocks_is_5_88 + +$L$_last_num_blocks_is_3_1_88: + + cmp r10d,2 + ja NEAR $L$_last_num_blocks_is_3_88 + je NEAR $L$_last_num_blocks_is_2_88 +$L$_last_num_blocks_is_1_88: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + kmovq k1,[rax*8+r10] + cmp r15d,255 + jae NEAR $L$_16_blocks_overflow_89 + vpaddd xmm0,xmm2,xmm28 + jmp NEAR $L$_16_blocks_ok_89 + +$L$_16_blocks_overflow_89: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpshufb xmm0,xmm0,xmm29 +$L$_16_blocks_ok_89: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1280+rsp] + vmovdqu64 zmm1,ZMMWORD[512+rsp] + vextracti32x4 xmm2,zmm0,0 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[576+rsp] + vmovdqa64 zmm22,ZMMWORD[1344+rsp] + vpxorq xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[640+rsp] + vmovdqa64 zmm8,ZMMWORD[1408+rsp] + vaesenc xmm0,xmm0,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[704+rsp] + vmovdqa64 zmm22,ZMMWORD[1472+rsp] + vaesenc xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc xmm0,xmm0,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 xmm17{k1}{z},[r11*1+r9] + vaesenc xmm0,xmm0,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm14,zmm24,zmm12,0x96 + vpternlogq zmm7,zmm25,zmm13,0x96 + vpternlogq zmm10,zmm26,zmm15,0x96 + vaesenc xmm0,xmm0,xmm31 + 
vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vpsrldq zmm15,zmm10,8 + vpslldq zmm10,zmm10,8 + + vmovdqa64 xmm16,XMMWORD[POLY2] + vaesenc xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vpxorq zmm14,zmm14,zmm15 + vpxorq zmm7,zmm7,zmm10 + vaesenc xmm0,xmm0,xmm31 + vextracti64x4 ymm12,zmm14,1 + vpxorq ymm14,ymm14,ymm12 + vextracti32x4 xmm12,ymm14,1 + vpxorq xmm14,xmm14,xmm12 + vextracti64x4 ymm13,zmm7,1 + vpxorq ymm7,ymm7,ymm13 + vextracti32x4 xmm13,ymm7,1 + vpxorq xmm7,xmm7,xmm13 + vpclmulqdq xmm13,xmm16,xmm7,0x01 + vpslldq xmm13,xmm13,8 + vpxorq xmm13,xmm7,xmm13 + vpclmulqdq xmm12,xmm16,xmm13,0x00 + vpsrldq xmm12,xmm12,4 + vpclmulqdq xmm15,xmm16,xmm13,0x10 + vpslldq xmm15,xmm15,4 + + vpternlogq xmm14,xmm15,xmm12,0x96 + vaesenclast xmm0,xmm0,xmm30 + vpxorq xmm0,xmm0,xmm17 + vextracti32x4 xmm11,zmm0,0 + mov r10,QWORD[120+rbp] + vmovdqu8 XMMWORD[r11*1+r10]{k1},xmm0 + vmovdqu8 zmm0{k1}{z},zmm0 + vpshufb xmm17,xmm0,xmm29 + vextracti32x4 xmm7,zmm17,0 + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_90 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm17,xmm1,0x01 + vpclmulqdq xmm5,xmm17,xmm1,0x10 + vpclmulqdq xmm0,xmm17,xmm1,0x11 + vpclmulqdq xmm3,xmm17,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_90 +$L$_small_initial_partial_block_90: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + + + + + + + + + + + + vpxorq xmm14,xmm14,xmm7 + + jmp NEAR $L$_after_reduction_90 +$L$_small_initial_compute_done_90: +$L$_after_reduction_90: + jmp NEAR $L$_last_blocks_done_88 +$L$_last_num_blocks_is_2_88: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + kmovq k1,[rax*8+r10] + cmp r15d,254 + jae NEAR $L$_16_blocks_overflow_91 + vpaddd ymm0,ymm2,ymm28 + jmp NEAR $L$_16_blocks_ok_91 + +$L$_16_blocks_overflow_91: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpshufb ymm0,ymm0,ymm29 +$L$_16_blocks_ok_91: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1280+rsp] + vmovdqu64 zmm1,ZMMWORD[512+rsp] + vextracti32x4 xmm2,zmm0,1 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[576+rsp] + vmovdqa64 zmm22,ZMMWORD[1344+rsp] + vpxorq ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[640+rsp] + vmovdqa64 zmm8,ZMMWORD[1408+rsp] + vaesenc ymm0,ymm0,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[704+rsp] + vmovdqa64 zmm22,ZMMWORD[1472+rsp] + vaesenc ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq 
zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc ymm0,ymm0,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 ymm17{k1}{z},[r11*1+r9] + vaesenc ymm0,ymm0,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm14,zmm24,zmm12,0x96 + vpternlogq zmm7,zmm25,zmm13,0x96 + vpternlogq zmm10,zmm26,zmm15,0x96 + vaesenc ymm0,ymm0,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vpsrldq zmm15,zmm10,8 + vpslldq zmm10,zmm10,8 + + vmovdqa64 xmm16,XMMWORD[POLY2] + vaesenc ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vpxorq zmm14,zmm14,zmm15 + vpxorq zmm7,zmm7,zmm10 + vaesenc ymm0,ymm0,ymm31 + vextracti64x4 ymm12,zmm14,1 + vpxorq ymm14,ymm14,ymm12 + vextracti32x4 xmm12,ymm14,1 + vpxorq xmm14,xmm14,xmm12 + vextracti64x4 ymm13,zmm7,1 + vpxorq ymm7,ymm7,ymm13 + vextracti32x4 xmm13,ymm7,1 + vpxorq xmm7,xmm7,xmm13 + vpclmulqdq xmm13,xmm16,xmm7,0x01 + vpslldq xmm13,xmm13,8 + vpxorq xmm13,xmm7,xmm13 + vpclmulqdq xmm12,xmm16,xmm13,0x00 + vpsrldq xmm12,xmm12,4 + vpclmulqdq xmm15,xmm16,xmm13,0x10 + vpslldq xmm15,xmm15,4 + + vpternlogq xmm14,xmm15,xmm12,0x96 + vaesenclast ymm0,ymm0,ymm30 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm11,zmm0,1 + mov r10,QWORD[120+rbp] + vmovdqu8 YMMWORD[r11*1+r10]{k1},ymm0 + vmovdqu8 zmm0{k1}{z},zmm0 + vpshufb ymm17,ymm0,ymm29 + vextracti32x4 xmm7,zmm17,1 + sub r13,16 * (2 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_92 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm17,ymm1,0x01 + vpclmulqdq ymm5,ymm17,ymm1,0x10 + vpclmulqdq ymm0,ymm17,ymm1,0x11 + vpclmulqdq ymm3,ymm17,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_92 +$L$_small_initial_partial_block_92: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm17,xmm1,0x01 + vpclmulqdq xmm5,xmm17,xmm1,0x10 + vpclmulqdq xmm0,xmm17,xmm1,0x11 + vpclmulqdq xmm3,xmm17,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 
+ vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_92: + + or r13,r13 + je NEAR $L$_after_reduction_92 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_92: + jmp NEAR $L$_last_blocks_done_88 +$L$_last_num_blocks_is_3_88: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + kmovq k1,[rax*8+r10] + cmp r15d,253 + jae NEAR $L$_16_blocks_overflow_93 + vpaddd zmm0,zmm2,zmm28 + jmp NEAR $L$_16_blocks_ok_93 + +$L$_16_blocks_overflow_93: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpshufb zmm0,zmm0,zmm29 +$L$_16_blocks_ok_93: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1280+rsp] + vmovdqu64 zmm1,ZMMWORD[512+rsp] + vextracti32x4 xmm2,zmm0,2 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[576+rsp] + vmovdqa64 zmm22,ZMMWORD[1344+rsp] + vpxorq zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[640+rsp] + vmovdqa64 zmm8,ZMMWORD[1408+rsp] + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[704+rsp] + vmovdqa64 zmm22,ZMMWORD[1472+rsp] + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17{k1}{z},[r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm14,zmm24,zmm12,0x96 + vpternlogq zmm7,zmm25,zmm13,0x96 + vpternlogq zmm10,zmm26,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vpsrldq zmm15,zmm10,8 + vpslldq zmm10,zmm10,8 + + vmovdqa64 xmm16,XMMWORD[POLY2] + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vpxorq zmm14,zmm14,zmm15 + vpxorq zmm7,zmm7,zmm10 + vaesenc zmm0,zmm0,zmm31 + vextracti64x4 ymm12,zmm14,1 + vpxorq ymm14,ymm14,ymm12 + vextracti32x4 xmm12,ymm14,1 + vpxorq xmm14,xmm14,xmm12 + vextracti64x4 ymm13,zmm7,1 + vpxorq ymm7,ymm7,ymm13 + vextracti32x4 xmm13,ymm7,1 + vpxorq xmm7,xmm7,xmm13 + vpclmulqdq xmm13,xmm16,xmm7,0x01 + vpslldq xmm13,xmm13,8 + vpxorq xmm13,xmm7,xmm13 + vpclmulqdq xmm12,xmm16,xmm13,0x00 + vpsrldq xmm12,xmm12,4 + vpclmulqdq xmm15,xmm16,xmm13,0x10 + vpslldq xmm15,xmm15,4 + + vpternlogq xmm14,xmm15,xmm12,0x96 + vaesenclast zmm0,zmm0,zmm30 + vpxorq zmm0,zmm0,zmm17 + vextracti32x4 xmm11,zmm0,2 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10]{k1},zmm0 + vmovdqu8 zmm0{k1}{z},zmm0 + vpshufb zmm17,zmm0,zmm29 + vextracti32x4 xmm7,zmm17,2 + sub r13,16 * (3 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_94 + + + + + + sub r13,16 + mov QWORD[r8],0 + 
vpxorq zmm17,zmm17,zmm14 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_94 +$L$_small_initial_partial_block_94: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm17,ymm1,0x01 + vpclmulqdq ymm5,ymm17,ymm1,0x10 + vpclmulqdq ymm0,ymm17,ymm1,0x11 + vpclmulqdq ymm3,ymm17,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_94: + + or r13,r13 + je NEAR $L$_after_reduction_94 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_94: + jmp NEAR $L$_last_blocks_done_88 +$L$_last_num_blocks_is_4_88: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + kmovq k1,[rax*8+r10] + cmp r15d,252 + jae NEAR $L$_16_blocks_overflow_95 + vpaddd zmm0,zmm2,zmm28 + jmp NEAR $L$_16_blocks_ok_95 + +$L$_16_blocks_overflow_95: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpshufb zmm0,zmm0,zmm29 +$L$_16_blocks_ok_95: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1280+rsp] + vmovdqu64 zmm1,ZMMWORD[512+rsp] + vextracti32x4 xmm2,zmm0,3 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[576+rsp] + vmovdqa64 zmm22,ZMMWORD[1344+rsp] + vpxorq zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[640+rsp] + vmovdqa64 zmm8,ZMMWORD[1408+rsp] + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[704+rsp] + vmovdqa64 zmm22,ZMMWORD[1472+rsp] + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 
+ vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17{k1}{z},[r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm14,zmm24,zmm12,0x96 + vpternlogq zmm7,zmm25,zmm13,0x96 + vpternlogq zmm10,zmm26,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vpsrldq zmm15,zmm10,8 + vpslldq zmm10,zmm10,8 + + vmovdqa64 xmm16,XMMWORD[POLY2] + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vpxorq zmm14,zmm14,zmm15 + vpxorq zmm7,zmm7,zmm10 + vaesenc zmm0,zmm0,zmm31 + vextracti64x4 ymm12,zmm14,1 + vpxorq ymm14,ymm14,ymm12 + vextracti32x4 xmm12,ymm14,1 + vpxorq xmm14,xmm14,xmm12 + vextracti64x4 ymm13,zmm7,1 + vpxorq ymm7,ymm7,ymm13 + vextracti32x4 xmm13,ymm7,1 + vpxorq xmm7,xmm7,xmm13 + vpclmulqdq xmm13,xmm16,xmm7,0x01 + vpslldq xmm13,xmm13,8 + vpxorq xmm13,xmm7,xmm13 + vpclmulqdq xmm12,xmm16,xmm13,0x00 + vpsrldq xmm12,xmm12,4 + vpclmulqdq xmm15,xmm16,xmm13,0x10 + vpslldq xmm15,xmm15,4 + + vpternlogq xmm14,xmm15,xmm12,0x96 + vaesenclast zmm0,zmm0,zmm30 + vpxorq zmm0,zmm0,zmm17 + vextracti32x4 xmm11,zmm0,3 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10]{k1},zmm0 + vmovdqu8 zmm0{k1}{z},zmm0 + vpshufb zmm17,zmm0,zmm29 + vextracti32x4 xmm7,zmm17,3 + sub r13,16 * (4 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_96 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_96 +$L$_small_initial_partial_block_96: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_96: + + or r13,r13 + je NEAR 
$L$_after_reduction_96 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_96: + jmp NEAR $L$_last_blocks_done_88 +$L$_last_num_blocks_is_5_88: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,64 + kmovq k1,[rax*8+r10] + cmp r15d,251 + jae NEAR $L$_16_blocks_overflow_97 + vpaddd zmm0,zmm2,zmm28 + vpaddd xmm3,xmm0,xmm27 + jmp NEAR $L$_16_blocks_ok_97 + +$L$_16_blocks_overflow_97: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb xmm3,xmm3,xmm29 +$L$_16_blocks_ok_97: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1280+rsp] + vmovdqu64 zmm1,ZMMWORD[512+rsp] + vextracti32x4 xmm2,zmm3,0 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[576+rsp] + vmovdqa64 zmm22,ZMMWORD[1344+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[640+rsp] + vmovdqa64 zmm8,ZMMWORD[1408+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[704+rsp] + vmovdqa64 zmm22,ZMMWORD[1472+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 xmm19{k1}{z},[64+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm14,zmm24,zmm12,0x96 + vpternlogq zmm7,zmm25,zmm13,0x96 + vpternlogq zmm10,zmm26,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vpsrldq zmm15,zmm10,8 + vpslldq zmm10,zmm10,8 + + vmovdqa64 xmm16,XMMWORD[POLY2] + vaesenc zmm0,zmm0,zmm30 + vaesenc xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vpxorq zmm14,zmm14,zmm15 + vpxorq zmm7,zmm7,zmm10 + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vextracti64x4 ymm12,zmm14,1 + vpxorq ymm14,ymm14,ymm12 + vextracti32x4 xmm12,ymm14,1 + vpxorq xmm14,xmm14,xmm12 + vextracti64x4 ymm13,zmm7,1 + vpxorq ymm7,ymm7,ymm13 + vextracti32x4 xmm13,ymm7,1 + vpxorq xmm7,xmm7,xmm13 + vpclmulqdq xmm13,xmm16,xmm7,0x01 + vpslldq xmm13,xmm13,8 + vpxorq xmm13,xmm7,xmm13 + vpclmulqdq xmm12,xmm16,xmm13,0x00 + vpsrldq xmm12,xmm12,4 + vpclmulqdq xmm15,xmm16,xmm13,0x10 + vpslldq xmm15,xmm15,4 + + vpternlogq xmm14,xmm15,xmm12,0x96 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast xmm3,xmm3,xmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq xmm3,xmm3,xmm19 + 
vextracti32x4 xmm11,zmm3,0 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 XMMWORD[64+r11*1+r10]{k1},xmm3 + vmovdqu8 zmm3{k1}{z},zmm3 + vpshufb zmm17,zmm0,zmm29 + vpshufb xmm19,xmm3,xmm29 + vextracti32x4 xmm7,zmm19,0 + sub r13,16 * (5 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_98 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm19,xmm1,0x01 + vpclmulqdq xmm5,xmm19,xmm1,0x10 + vpclmulqdq xmm0,xmm19,xmm1,0x11 + vpclmulqdq xmm3,xmm19,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm31 + vpxorq zmm0,zmm0,zmm8 + vpxorq zmm3,zmm3,zmm22 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_98 +$L$_small_initial_partial_block_98: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_98: + + or r13,r13 + je NEAR $L$_after_reduction_98 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_98: + jmp NEAR $L$_last_blocks_done_88 +$L$_last_num_blocks_is_6_88: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,64 + kmovq k1,[rax*8+r10] + cmp r15d,250 + jae NEAR $L$_16_blocks_overflow_99 + vpaddd zmm0,zmm2,zmm28 + vpaddd ymm3,ymm0,ymm27 + jmp NEAR $L$_16_blocks_ok_99 + +$L$_16_blocks_overflow_99: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb ymm3,ymm3,ymm29 +$L$_16_blocks_ok_99: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1280+rsp] + vmovdqu64 zmm1,ZMMWORD[512+rsp] + vextracti32x4 xmm2,zmm3,1 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[576+rsp] + vmovdqa64 zmm22,ZMMWORD[1344+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq 
zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[640+rsp] + vmovdqa64 zmm8,ZMMWORD[1408+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[704+rsp] + vmovdqa64 zmm22,ZMMWORD[1472+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 ymm19{k1}{z},[64+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm14,zmm24,zmm12,0x96 + vpternlogq zmm7,zmm25,zmm13,0x96 + vpternlogq zmm10,zmm26,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vpsrldq zmm15,zmm10,8 + vpslldq zmm10,zmm10,8 + + vmovdqa64 xmm16,XMMWORD[POLY2] + vaesenc zmm0,zmm0,zmm30 + vaesenc ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vpxorq zmm14,zmm14,zmm15 + vpxorq zmm7,zmm7,zmm10 + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vextracti64x4 ymm12,zmm14,1 + vpxorq ymm14,ymm14,ymm12 + vextracti32x4 xmm12,ymm14,1 + vpxorq xmm14,xmm14,xmm12 + vextracti64x4 ymm13,zmm7,1 + vpxorq ymm7,ymm7,ymm13 + vextracti32x4 xmm13,ymm7,1 + vpxorq xmm7,xmm7,xmm13 + vpclmulqdq xmm13,xmm16,xmm7,0x01 + vpslldq xmm13,xmm13,8 + vpxorq xmm13,xmm7,xmm13 + vpclmulqdq xmm12,xmm16,xmm13,0x00 + vpsrldq xmm12,xmm12,4 + vpclmulqdq xmm15,xmm16,xmm13,0x10 + vpslldq xmm15,xmm15,4 + + vpternlogq xmm14,xmm15,xmm12,0x96 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast ymm3,ymm3,ymm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm11,zmm3,1 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 YMMWORD[64+r11*1+r10]{k1},ymm3 + vmovdqu8 zmm3{k1}{z},zmm3 + vpshufb zmm17,zmm0,zmm29 + vpshufb ymm19,ymm3,ymm29 + vextracti32x4 xmm7,zmm19,1 + sub r13,16 * (6 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_100 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm19,ymm1,0x01 + vpclmulqdq ymm5,ymm19,ymm1,0x10 + vpclmulqdq ymm0,ymm19,ymm1,0x11 + vpclmulqdq ymm3,ymm19,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm31 + vpxorq zmm0,zmm0,zmm8 + vpxorq zmm3,zmm3,zmm22 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 
ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_100 +$L$_small_initial_partial_block_100: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm19,xmm1,0x01 + vpclmulqdq xmm5,xmm19,xmm1,0x10 + vpclmulqdq xmm0,xmm19,xmm1,0x11 + vpclmulqdq xmm3,xmm19,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm31 + vpxorq zmm0,zmm0,zmm8 + vpxorq zmm3,zmm3,zmm22 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_100: + + or r13,r13 + je NEAR $L$_after_reduction_100 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_100: + jmp NEAR $L$_last_blocks_done_88 +$L$_last_num_blocks_is_7_88: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,64 + kmovq k1,[rax*8+r10] + cmp r15d,249 + jae NEAR $L$_16_blocks_overflow_101 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + jmp NEAR $L$_16_blocks_ok_101 + +$L$_16_blocks_overflow_101: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 +$L$_16_blocks_ok_101: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1280+rsp] + vmovdqu64 zmm1,ZMMWORD[512+rsp] + vextracti32x4 xmm2,zmm3,2 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[576+rsp] + vmovdqa64 zmm22,ZMMWORD[1344+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[640+rsp] + vmovdqa64 zmm8,ZMMWORD[1408+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[704+rsp] + vmovdqa64 zmm22,ZMMWORD[1472+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + 
vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19{k1}{z},[64+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm14,zmm24,zmm12,0x96 + vpternlogq zmm7,zmm25,zmm13,0x96 + vpternlogq zmm10,zmm26,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vpsrldq zmm15,zmm10,8 + vpslldq zmm10,zmm10,8 + + vmovdqa64 xmm16,XMMWORD[POLY2] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vpxorq zmm14,zmm14,zmm15 + vpxorq zmm7,zmm7,zmm10 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vextracti64x4 ymm12,zmm14,1 + vpxorq ymm14,ymm14,ymm12 + vextracti32x4 xmm12,ymm14,1 + vpxorq xmm14,xmm14,xmm12 + vextracti64x4 ymm13,zmm7,1 + vpxorq ymm7,ymm7,ymm13 + vextracti32x4 xmm13,ymm7,1 + vpxorq xmm7,xmm7,xmm13 + vpclmulqdq xmm13,xmm16,xmm7,0x01 + vpslldq xmm13,xmm13,8 + vpxorq xmm13,xmm7,xmm13 + vpclmulqdq xmm12,xmm16,xmm13,0x00 + vpsrldq xmm12,xmm12,4 + vpclmulqdq xmm15,xmm16,xmm13,0x10 + vpslldq xmm15,xmm15,4 + + vpternlogq xmm14,xmm15,xmm12,0x96 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti32x4 xmm11,zmm3,2 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10]{k1},zmm3 + vmovdqu8 zmm3{k1}{z},zmm3 + vpshufb zmm17,zmm0,zmm29 + vpshufb zmm19,zmm3,zmm29 + vextracti32x4 xmm7,zmm19,2 + sub r13,16 * (7 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_102 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm19,zmm1,0x01 + vpclmulqdq zmm5,zmm19,zmm1,0x10 + vpclmulqdq zmm0,zmm19,zmm1,0x11 + vpclmulqdq zmm3,zmm19,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm31 + vpxorq zmm0,zmm0,zmm8 + vpxorq zmm3,zmm3,zmm22 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_102 +$L$_small_initial_partial_block_102: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq 
ymm4,ymm19,ymm1,0x01 + vpclmulqdq ymm5,ymm19,ymm1,0x10 + vpclmulqdq ymm0,ymm19,ymm1,0x11 + vpclmulqdq ymm3,ymm19,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm31 + vpxorq zmm0,zmm0,zmm8 + vpxorq zmm3,zmm3,zmm22 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_102: + + or r13,r13 + je NEAR $L$_after_reduction_102 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_102: + jmp NEAR $L$_last_blocks_done_88 +$L$_last_num_blocks_is_8_88: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,64 + kmovq k1,[rax*8+r10] + cmp r15d,248 + jae NEAR $L$_16_blocks_overflow_103 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + jmp NEAR $L$_16_blocks_ok_103 + +$L$_16_blocks_overflow_103: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 +$L$_16_blocks_ok_103: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1280+rsp] + vmovdqu64 zmm1,ZMMWORD[512+rsp] + vextracti32x4 xmm2,zmm3,3 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[576+rsp] + vmovdqa64 zmm22,ZMMWORD[1344+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[640+rsp] + vmovdqa64 zmm8,ZMMWORD[1408+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[704+rsp] + vmovdqa64 zmm22,ZMMWORD[1472+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19{k1}{z},[64+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm14,zmm24,zmm12,0x96 + vpternlogq zmm7,zmm25,zmm13,0x96 + vpternlogq zmm10,zmm26,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + 
vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vpsrldq zmm15,zmm10,8 + vpslldq zmm10,zmm10,8 + + vmovdqa64 xmm16,XMMWORD[POLY2] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vpxorq zmm14,zmm14,zmm15 + vpxorq zmm7,zmm7,zmm10 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vextracti64x4 ymm12,zmm14,1 + vpxorq ymm14,ymm14,ymm12 + vextracti32x4 xmm12,ymm14,1 + vpxorq xmm14,xmm14,xmm12 + vextracti64x4 ymm13,zmm7,1 + vpxorq ymm7,ymm7,ymm13 + vextracti32x4 xmm13,ymm7,1 + vpxorq xmm7,xmm7,xmm13 + vpclmulqdq xmm13,xmm16,xmm7,0x01 + vpslldq xmm13,xmm13,8 + vpxorq xmm13,xmm7,xmm13 + vpclmulqdq xmm12,xmm16,xmm13,0x00 + vpsrldq xmm12,xmm12,4 + vpclmulqdq xmm15,xmm16,xmm13,0x10 + vpslldq xmm15,xmm15,4 + + vpternlogq xmm14,xmm15,xmm12,0x96 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti32x4 xmm11,zmm3,3 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10]{k1},zmm3 + vmovdqu8 zmm3{k1}{z},zmm3 + vpshufb zmm17,zmm0,zmm29 + vpshufb zmm19,zmm3,zmm29 + vextracti32x4 xmm7,zmm19,3 + sub r13,16 * (8 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_104 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[224+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_104 +$L$_small_initial_partial_block_104: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm19,zmm1,0x01 + vpclmulqdq zmm5,zmm19,zmm1,0x10 + vpclmulqdq zmm0,zmm19,zmm1,0x11 + vpclmulqdq zmm3,zmm19,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm31 + vpxorq zmm0,zmm0,zmm8 + vpxorq zmm3,zmm3,zmm22 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq 
xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_104: + + or r13,r13 + je NEAR $L$_after_reduction_104 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_104: + jmp NEAR $L$_last_blocks_done_88 +$L$_last_num_blocks_is_9_88: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,128 + kmovq k1,[rax*8+r10] + cmp r15d,247 + jae NEAR $L$_16_blocks_overflow_105 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd xmm4,xmm3,xmm27 + jmp NEAR $L$_16_blocks_ok_105 + +$L$_16_blocks_overflow_105: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb xmm4,xmm4,xmm29 +$L$_16_blocks_ok_105: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1280+rsp] + vmovdqu64 zmm1,ZMMWORD[512+rsp] + vextracti32x4 xmm2,zmm4,0 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[576+rsp] + vmovdqa64 zmm22,ZMMWORD[1344+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[640+rsp] + vmovdqa64 zmm8,ZMMWORD[1408+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[704+rsp] + vmovdqa64 zmm22,ZMMWORD[1472+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 xmm20{k1}{z},[128+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm14,zmm24,zmm12,0x96 + vpternlogq zmm7,zmm25,zmm13,0x96 + vpternlogq zmm10,zmm26,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vpsrldq zmm15,zmm10,8 + vpslldq zmm10,zmm10,8 + + vmovdqa64 xmm16,XMMWORD[POLY2] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vpxorq zmm14,zmm14,zmm15 + vpxorq zmm7,zmm7,zmm10 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vextracti64x4 ymm12,zmm14,1 + vpxorq ymm14,ymm14,ymm12 
+ vextracti32x4 xmm12,ymm14,1 + vpxorq xmm14,xmm14,xmm12 + vextracti64x4 ymm13,zmm7,1 + vpxorq ymm7,ymm7,ymm13 + vextracti32x4 xmm13,ymm7,1 + vpxorq xmm7,xmm7,xmm13 + vpclmulqdq xmm13,xmm16,xmm7,0x01 + vpslldq xmm13,xmm13,8 + vpxorq xmm13,xmm7,xmm13 + vpclmulqdq xmm12,xmm16,xmm13,0x00 + vpsrldq xmm12,xmm12,4 + vpclmulqdq xmm15,xmm16,xmm13,0x10 + vpslldq xmm15,xmm15,4 + + vpternlogq xmm14,xmm15,xmm12,0x96 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast xmm4,xmm4,xmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq xmm4,xmm4,xmm20 + vextracti32x4 xmm11,zmm4,0 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 XMMWORD[128+r11*1+r10]{k1},xmm4 + vmovdqu8 zmm4{k1}{z},zmm4 + vpshufb zmm17,zmm0,zmm29 + vpshufb zmm19,zmm3,zmm29 + vpshufb xmm20,xmm4,xmm29 + vextracti32x4 xmm7,zmm20,0 + sub r13,16 * (9 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_106 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[208+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm20,xmm1,0x01 + vpclmulqdq xmm5,xmm20,xmm1,0x10 + vpclmulqdq xmm0,xmm20,xmm1,0x11 + vpclmulqdq xmm3,xmm20,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm31 + vpxorq zmm0,zmm0,zmm8 + vpxorq zmm3,zmm3,zmm22 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_106 +$L$_small_initial_partial_block_106: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[224+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 
+ vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_106: + + or r13,r13 + je NEAR $L$_after_reduction_106 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_106: + jmp NEAR $L$_last_blocks_done_88 +$L$_last_num_blocks_is_10_88: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,128 + kmovq k1,[rax*8+r10] + cmp r15d,246 + jae NEAR $L$_16_blocks_overflow_107 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd ymm4,ymm3,ymm27 + jmp NEAR $L$_16_blocks_ok_107 + +$L$_16_blocks_overflow_107: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb ymm4,ymm4,ymm29 +$L$_16_blocks_ok_107: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1280+rsp] + vmovdqu64 zmm1,ZMMWORD[512+rsp] + vextracti32x4 xmm2,zmm4,1 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[576+rsp] + vmovdqa64 zmm22,ZMMWORD[1344+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[640+rsp] + vmovdqa64 zmm8,ZMMWORD[1408+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[704+rsp] + vmovdqa64 zmm22,ZMMWORD[1472+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 ymm20{k1}{z},[128+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm14,zmm24,zmm12,0x96 + vpternlogq zmm7,zmm25,zmm13,0x96 + vpternlogq zmm10,zmm26,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vpsrldq zmm15,zmm10,8 + vpslldq zmm10,zmm10,8 + + vmovdqa64 xmm16,XMMWORD[POLY2] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vpxorq zmm14,zmm14,zmm15 + vpxorq zmm7,zmm7,zmm10 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vextracti64x4 ymm12,zmm14,1 + vpxorq ymm14,ymm14,ymm12 + vextracti32x4 xmm12,ymm14,1 + vpxorq 
xmm14,xmm14,xmm12 + vextracti64x4 ymm13,zmm7,1 + vpxorq ymm7,ymm7,ymm13 + vextracti32x4 xmm13,ymm7,1 + vpxorq xmm7,xmm7,xmm13 + vpclmulqdq xmm13,xmm16,xmm7,0x01 + vpslldq xmm13,xmm13,8 + vpxorq xmm13,xmm7,xmm13 + vpclmulqdq xmm12,xmm16,xmm13,0x00 + vpsrldq xmm12,xmm12,4 + vpclmulqdq xmm15,xmm16,xmm13,0x10 + vpslldq xmm15,xmm15,4 + + vpternlogq xmm14,xmm15,xmm12,0x96 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast ymm4,ymm4,ymm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq ymm4,ymm4,ymm20 + vextracti32x4 xmm11,zmm4,1 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 YMMWORD[128+r11*1+r10]{k1},ymm4 + vmovdqu8 zmm4{k1}{z},zmm4 + vpshufb zmm17,zmm0,zmm29 + vpshufb zmm19,zmm3,zmm29 + vpshufb ymm20,ymm4,ymm29 + vextracti32x4 xmm7,zmm20,1 + sub r13,16 * (10 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_108 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[192+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm20,ymm1,0x01 + vpclmulqdq ymm5,ymm20,ymm1,0x10 + vpclmulqdq ymm0,ymm20,ymm1,0x11 + vpclmulqdq ymm3,ymm20,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm31 + vpxorq zmm0,zmm0,zmm8 + vpxorq zmm3,zmm3,zmm22 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_108 +$L$_small_initial_partial_block_108: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[208+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm20,xmm1,0x01 + vpclmulqdq xmm5,xmm20,xmm1,0x10 + vpclmulqdq xmm0,xmm20,xmm1,0x11 + vpclmulqdq xmm3,xmm20,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm31 + vpxorq zmm0,zmm0,zmm8 + vpxorq zmm3,zmm3,zmm22 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 
+ vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_108: + + or r13,r13 + je NEAR $L$_after_reduction_108 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_108: + jmp NEAR $L$_last_blocks_done_88 +$L$_last_num_blocks_is_11_88: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,128 + kmovq k1,[rax*8+r10] + cmp r15d,245 + jae NEAR $L$_16_blocks_overflow_109 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + jmp NEAR $L$_16_blocks_ok_109 + +$L$_16_blocks_overflow_109: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 +$L$_16_blocks_ok_109: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1280+rsp] + vmovdqu64 zmm1,ZMMWORD[512+rsp] + vextracti32x4 xmm2,zmm4,2 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[576+rsp] + vmovdqa64 zmm22,ZMMWORD[1344+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[640+rsp] + vmovdqa64 zmm8,ZMMWORD[1408+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[704+rsp] + vmovdqa64 zmm22,ZMMWORD[1472+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20{k1}{z},[128+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm14,zmm24,zmm12,0x96 + vpternlogq zmm7,zmm25,zmm13,0x96 + vpternlogq zmm10,zmm26,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vpsrldq zmm15,zmm10,8 + vpslldq zmm10,zmm10,8 + + vmovdqa64 xmm16,XMMWORD[POLY2] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 
zmm30,ZMMWORD[160+rcx] + vpxorq zmm14,zmm14,zmm15 + vpxorq zmm7,zmm7,zmm10 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vextracti64x4 ymm12,zmm14,1 + vpxorq ymm14,ymm14,ymm12 + vextracti32x4 xmm12,ymm14,1 + vpxorq xmm14,xmm14,xmm12 + vextracti64x4 ymm13,zmm7,1 + vpxorq ymm7,ymm7,ymm13 + vextracti32x4 xmm13,ymm7,1 + vpxorq xmm7,xmm7,xmm13 + vpclmulqdq xmm13,xmm16,xmm7,0x01 + vpslldq xmm13,xmm13,8 + vpxorq xmm13,xmm7,xmm13 + vpclmulqdq xmm12,xmm16,xmm13,0x00 + vpsrldq xmm12,xmm12,4 + vpclmulqdq xmm15,xmm16,xmm13,0x10 + vpslldq xmm15,xmm15,4 + + vpternlogq xmm14,xmm15,xmm12,0x96 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vextracti32x4 xmm11,zmm4,2 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10]{k1},zmm4 + vmovdqu8 zmm4{k1}{z},zmm4 + vpshufb zmm17,zmm0,zmm29 + vpshufb zmm19,zmm3,zmm29 + vpshufb zmm20,zmm4,zmm29 + vextracti32x4 xmm7,zmm20,2 + sub r13,16 * (11 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_110 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[176+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm20,zmm1,0x01 + vpclmulqdq zmm5,zmm20,zmm1,0x10 + vpclmulqdq zmm0,zmm20,zmm1,0x11 + vpclmulqdq zmm3,zmm20,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm31 + vpxorq zmm0,zmm0,zmm8 + vpxorq zmm3,zmm3,zmm22 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_110 +$L$_small_initial_partial_block_110: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[192+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm20,ymm1,0x01 + vpclmulqdq ymm5,ymm20,ymm1,0x10 + vpclmulqdq ymm0,ymm20,ymm1,0x11 + vpclmulqdq ymm3,ymm20,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm31 + vpxorq zmm0,zmm0,zmm8 + vpxorq zmm3,zmm3,zmm22 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq 
zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_110: + + or r13,r13 + je NEAR $L$_after_reduction_110 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_110: + jmp NEAR $L$_last_blocks_done_88 +$L$_last_num_blocks_is_12_88: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,128 + kmovq k1,[rax*8+r10] + cmp r15d,244 + jae NEAR $L$_16_blocks_overflow_111 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + jmp NEAR $L$_16_blocks_ok_111 + +$L$_16_blocks_overflow_111: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 +$L$_16_blocks_ok_111: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1280+rsp] + vmovdqu64 zmm1,ZMMWORD[512+rsp] + vextracti32x4 xmm2,zmm4,3 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[576+rsp] + vmovdqa64 zmm22,ZMMWORD[1344+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[640+rsp] + vmovdqa64 zmm8,ZMMWORD[1408+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[704+rsp] + vmovdqa64 zmm22,ZMMWORD[1472+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20{k1}{z},[128+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm14,zmm24,zmm12,0x96 + vpternlogq zmm7,zmm25,zmm13,0x96 + vpternlogq zmm10,zmm26,zmm15,0x96 + vaesenc 
zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vpsrldq zmm15,zmm10,8 + vpslldq zmm10,zmm10,8 + + vmovdqa64 xmm16,XMMWORD[POLY2] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vpxorq zmm14,zmm14,zmm15 + vpxorq zmm7,zmm7,zmm10 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vextracti64x4 ymm12,zmm14,1 + vpxorq ymm14,ymm14,ymm12 + vextracti32x4 xmm12,ymm14,1 + vpxorq xmm14,xmm14,xmm12 + vextracti64x4 ymm13,zmm7,1 + vpxorq ymm7,ymm7,ymm13 + vextracti32x4 xmm13,ymm7,1 + vpxorq xmm7,xmm7,xmm13 + vpclmulqdq xmm13,xmm16,xmm7,0x01 + vpslldq xmm13,xmm13,8 + vpxorq xmm13,xmm7,xmm13 + vpclmulqdq xmm12,xmm16,xmm13,0x00 + vpsrldq xmm12,xmm12,4 + vpclmulqdq xmm15,xmm16,xmm13,0x10 + vpslldq xmm15,xmm15,4 + + vpternlogq xmm14,xmm15,xmm12,0x96 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vextracti32x4 xmm11,zmm4,3 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10]{k1},zmm4 + vmovdqu8 zmm4{k1}{z},zmm4 + vpshufb zmm17,zmm0,zmm29 + vpshufb zmm19,zmm3,zmm29 + vpshufb zmm20,zmm4,zmm29 + vextracti32x4 xmm7,zmm20,3 + sub r13,16 * (12 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_112 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[160+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[224+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_112 +$L$_small_initial_partial_block_112: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[176+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm20,zmm1,0x01 + vpclmulqdq 
zmm5,zmm20,zmm1,0x10 + vpclmulqdq zmm0,zmm20,zmm1,0x11 + vpclmulqdq zmm3,zmm20,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm31 + vpxorq zmm0,zmm0,zmm8 + vpxorq zmm3,zmm3,zmm22 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_112: + + or r13,r13 + je NEAR $L$_after_reduction_112 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_112: + jmp NEAR $L$_last_blocks_done_88 +$L$_last_num_blocks_is_13_88: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,192 + kmovq k1,[rax*8+r10] + cmp r15d,243 + jae NEAR $L$_16_blocks_overflow_113 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + vpaddd xmm5,xmm4,xmm27 + jmp NEAR $L$_16_blocks_ok_113 + +$L$_16_blocks_overflow_113: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpaddd zmm5,zmm4,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb xmm5,xmm5,xmm29 +$L$_16_blocks_ok_113: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1280+rsp] + vmovdqu64 zmm1,ZMMWORD[512+rsp] + vextracti32x4 xmm2,zmm5,0 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[576+rsp] + vmovdqa64 zmm22,ZMMWORD[1344+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vpxorq xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[640+rsp] + vmovdqa64 zmm8,ZMMWORD[1408+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[704+rsp] + vmovdqa64 zmm22,ZMMWORD[1472+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20,ZMMWORD[128+r11*1+r9] + vmovdqu8 xmm21{k1}{z},[192+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 
+ vaesenc xmm5,xmm5,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm14,zmm24,zmm12,0x96 + vpternlogq zmm7,zmm25,zmm13,0x96 + vpternlogq zmm10,zmm26,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vpsrldq zmm15,zmm10,8 + vpslldq zmm10,zmm10,8 + + vmovdqa64 xmm16,XMMWORD[POLY2] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vpxorq zmm14,zmm14,zmm15 + vpxorq zmm7,zmm7,zmm10 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vextracti64x4 ymm12,zmm14,1 + vpxorq ymm14,ymm14,ymm12 + vextracti32x4 xmm12,ymm14,1 + vpxorq xmm14,xmm14,xmm12 + vextracti64x4 ymm13,zmm7,1 + vpxorq ymm7,ymm7,ymm13 + vextracti32x4 xmm13,ymm7,1 + vpxorq xmm7,xmm7,xmm13 + vpclmulqdq xmm13,xmm16,xmm7,0x01 + vpslldq xmm13,xmm13,8 + vpxorq xmm13,xmm7,xmm13 + vpclmulqdq xmm12,xmm16,xmm13,0x00 + vpsrldq xmm12,xmm12,4 + vpclmulqdq xmm15,xmm16,xmm13,0x10 + vpslldq xmm15,xmm15,4 + + vpternlogq xmm14,xmm15,xmm12,0x96 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vaesenclast xmm5,xmm5,xmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vpxorq xmm5,xmm5,xmm21 + vextracti32x4 xmm11,zmm5,0 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm4 + vmovdqu8 XMMWORD[192+r11*1+r10]{k1},xmm5 + vmovdqu8 zmm5{k1}{z},zmm5 + vpshufb zmm17,zmm0,zmm29 + vpshufb zmm19,zmm3,zmm29 + vpshufb zmm20,zmm4,zmm29 + vpshufb xmm21,xmm5,xmm29 + vextracti32x4 xmm7,zmm21,0 + sub r13,16 * (13 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_114 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[144+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[208+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm21,xmm1,0x01 + vpclmulqdq xmm5,xmm21,xmm1,0x10 + vpclmulqdq xmm0,xmm21,xmm1,0x11 + vpclmulqdq xmm3,xmm21,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm31 + vpxorq zmm0,zmm0,zmm8 + vpxorq zmm3,zmm3,zmm22 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + 
vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_114 +$L$_small_initial_partial_block_114: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[160+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[224+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_114: + + or r13,r13 + je NEAR $L$_after_reduction_114 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_114: + jmp NEAR $L$_last_blocks_done_88 +$L$_last_num_blocks_is_14_88: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,192 + kmovq k1,[rax*8+r10] + cmp r15d,242 + jae NEAR $L$_16_blocks_overflow_115 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + vpaddd ymm5,ymm4,ymm27 + jmp NEAR $L$_16_blocks_ok_115 + +$L$_16_blocks_overflow_115: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpaddd zmm5,zmm4,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb ymm5,ymm5,ymm29 +$L$_16_blocks_ok_115: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1280+rsp] + vmovdqu64 zmm1,ZMMWORD[512+rsp] + vextracti32x4 xmm2,zmm5,1 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[576+rsp] + vmovdqa64 zmm22,ZMMWORD[1344+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vpxorq ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[640+rsp] + vmovdqa64 zmm8,ZMMWORD[1408+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[704+rsp] + vmovdqa64 zmm22,ZMMWORD[1472+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc 
zmm4,zmm4,zmm30 + vaesenc ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20,ZMMWORD[128+r11*1+r9] + vmovdqu8 ymm21{k1}{z},[192+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm14,zmm24,zmm12,0x96 + vpternlogq zmm7,zmm25,zmm13,0x96 + vpternlogq zmm10,zmm26,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vpsrldq zmm15,zmm10,8 + vpslldq zmm10,zmm10,8 + + vmovdqa64 xmm16,XMMWORD[POLY2] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vpxorq zmm14,zmm14,zmm15 + vpxorq zmm7,zmm7,zmm10 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vextracti64x4 ymm12,zmm14,1 + vpxorq ymm14,ymm14,ymm12 + vextracti32x4 xmm12,ymm14,1 + vpxorq xmm14,xmm14,xmm12 + vextracti64x4 ymm13,zmm7,1 + vpxorq ymm7,ymm7,ymm13 + vextracti32x4 xmm13,ymm7,1 + vpxorq xmm7,xmm7,xmm13 + vpclmulqdq xmm13,xmm16,xmm7,0x01 + vpslldq xmm13,xmm13,8 + vpxorq xmm13,xmm7,xmm13 + vpclmulqdq xmm12,xmm16,xmm13,0x00 + vpsrldq xmm12,xmm12,4 + vpclmulqdq xmm15,xmm16,xmm13,0x10 + vpslldq xmm15,xmm15,4 + + vpternlogq xmm14,xmm15,xmm12,0x96 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vaesenclast ymm5,ymm5,ymm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vpxorq ymm5,ymm5,ymm21 + vextracti32x4 xmm11,zmm5,1 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm4 + vmovdqu8 YMMWORD[192+r11*1+r10]{k1},ymm5 + vmovdqu8 zmm5{k1}{z},zmm5 + vpshufb zmm17,zmm0,zmm29 + vpshufb zmm19,zmm3,zmm29 + vpshufb zmm20,zmm4,zmm29 + vpshufb ymm21,ymm5,ymm29 + vextracti32x4 xmm7,zmm21,1 + sub r13,16 * (14 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_116 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[128+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[192+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + 
vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm21,ymm1,0x01 + vpclmulqdq ymm5,ymm21,ymm1,0x10 + vpclmulqdq ymm0,ymm21,ymm1,0x11 + vpclmulqdq ymm3,ymm21,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm31 + vpxorq zmm0,zmm0,zmm8 + vpxorq zmm3,zmm3,zmm22 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_116 +$L$_small_initial_partial_block_116: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[144+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[208+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm21,xmm1,0x01 + vpclmulqdq xmm5,xmm21,xmm1,0x10 + vpclmulqdq xmm0,xmm21,xmm1,0x11 + vpclmulqdq xmm3,xmm21,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm31 + vpxorq zmm0,zmm0,zmm8 + vpxorq zmm3,zmm3,zmm22 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_116: + + or r13,r13 + je NEAR $L$_after_reduction_116 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_116: + jmp NEAR $L$_last_blocks_done_88 +$L$_last_num_blocks_is_15_88: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,192 + kmovq k1,[rax*8+r10] + cmp r15d,241 + jae NEAR $L$_16_blocks_overflow_117 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + vpaddd zmm5,zmm4,zmm27 + jmp NEAR $L$_16_blocks_ok_117 + +$L$_16_blocks_overflow_117: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpaddd zmm5,zmm4,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb 
zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb zmm5,zmm5,zmm29 +$L$_16_blocks_ok_117: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1280+rsp] + vmovdqu64 zmm1,ZMMWORD[512+rsp] + vextracti32x4 xmm2,zmm5,2 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[576+rsp] + vmovdqa64 zmm22,ZMMWORD[1344+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[640+rsp] + vmovdqa64 zmm8,ZMMWORD[1408+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[704+rsp] + vmovdqa64 zmm22,ZMMWORD[1472+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20,ZMMWORD[128+r11*1+r9] + vmovdqu8 zmm21{k1}{z},[192+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm14,zmm24,zmm12,0x96 + vpternlogq zmm7,zmm25,zmm13,0x96 + vpternlogq zmm10,zmm26,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vpsrldq zmm15,zmm10,8 + vpslldq zmm10,zmm10,8 + + vmovdqa64 xmm16,XMMWORD[POLY2] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vpxorq zmm14,zmm14,zmm15 + vpxorq zmm7,zmm7,zmm10 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vextracti64x4 ymm12,zmm14,1 + vpxorq ymm14,ymm14,ymm12 + vextracti32x4 xmm12,ymm14,1 + vpxorq xmm14,xmm14,xmm12 + vextracti64x4 ymm13,zmm7,1 + vpxorq ymm7,ymm7,ymm13 + vextracti32x4 xmm13,ymm7,1 + vpxorq xmm7,xmm7,xmm13 + vpclmulqdq xmm13,xmm16,xmm7,0x01 + vpslldq xmm13,xmm13,8 + vpxorq xmm13,xmm7,xmm13 + vpclmulqdq xmm12,xmm16,xmm13,0x00 + vpsrldq xmm12,xmm12,4 + vpclmulqdq xmm15,xmm16,xmm13,0x10 + vpslldq xmm15,xmm15,4 + + vpternlogq xmm14,xmm15,xmm12,0x96 + vaesenclast 
zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vaesenclast zmm5,zmm5,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vpxorq zmm5,zmm5,zmm21 + vextracti32x4 xmm11,zmm5,2 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm4 + vmovdqu8 ZMMWORD[192+r11*1+r10]{k1},zmm5 + vmovdqu8 zmm5{k1}{z},zmm5 + vpshufb zmm17,zmm0,zmm29 + vpshufb zmm19,zmm3,zmm29 + vpshufb zmm20,zmm4,zmm29 + vpshufb zmm21,zmm5,zmm29 + vextracti32x4 xmm7,zmm21,2 + sub r13,16 * (15 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_118 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[112+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[176+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm21,zmm1,0x01 + vpclmulqdq zmm5,zmm21,zmm1,0x10 + vpclmulqdq zmm0,zmm21,zmm1,0x11 + vpclmulqdq zmm3,zmm21,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm31 + vpxorq zmm0,zmm0,zmm8 + vpxorq zmm3,zmm3,zmm22 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_118 +$L$_small_initial_partial_block_118: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[128+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[192+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm21,ymm1,0x01 + vpclmulqdq ymm5,ymm21,ymm1,0x10 + vpclmulqdq ymm0,ymm21,ymm1,0x11 + vpclmulqdq ymm3,ymm21,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm31 + vpxorq zmm0,zmm0,zmm8 + vpxorq zmm3,zmm3,zmm22 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + 
vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_118: + + or r13,r13 + je NEAR $L$_after_reduction_118 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_118: + jmp NEAR $L$_last_blocks_done_88 +$L$_last_num_blocks_is_16_88: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,192 + kmovq k1,[rax*8+r10] + cmp r15d,240 + jae NEAR $L$_16_blocks_overflow_119 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + vpaddd zmm5,zmm4,zmm27 + jmp NEAR $L$_16_blocks_ok_119 + +$L$_16_blocks_overflow_119: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpaddd zmm5,zmm4,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb zmm5,zmm5,zmm29 +$L$_16_blocks_ok_119: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1280+rsp] + vmovdqu64 zmm1,ZMMWORD[512+rsp] + vextracti32x4 xmm2,zmm5,3 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[576+rsp] + vmovdqa64 zmm22,ZMMWORD[1344+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[640+rsp] + vmovdqa64 zmm8,ZMMWORD[1408+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[704+rsp] + vmovdqa64 zmm22,ZMMWORD[1472+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20,ZMMWORD[128+r11*1+r9] + vmovdqu8 zmm21{k1}{z},[192+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc 
zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm14,zmm24,zmm12,0x96 + vpternlogq zmm7,zmm25,zmm13,0x96 + vpternlogq zmm10,zmm26,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vpsrldq zmm15,zmm10,8 + vpslldq zmm10,zmm10,8 + + vmovdqa64 xmm16,XMMWORD[POLY2] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vpxorq zmm14,zmm14,zmm15 + vpxorq zmm7,zmm7,zmm10 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vextracti64x4 ymm12,zmm14,1 + vpxorq ymm14,ymm14,ymm12 + vextracti32x4 xmm12,ymm14,1 + vpxorq xmm14,xmm14,xmm12 + vextracti64x4 ymm13,zmm7,1 + vpxorq ymm7,ymm7,ymm13 + vextracti32x4 xmm13,ymm7,1 + vpxorq xmm7,xmm7,xmm13 + vpclmulqdq xmm13,xmm16,xmm7,0x01 + vpslldq xmm13,xmm13,8 + vpxorq xmm13,xmm7,xmm13 + vpclmulqdq xmm12,xmm16,xmm13,0x00 + vpsrldq xmm12,xmm12,4 + vpclmulqdq xmm15,xmm16,xmm13,0x10 + vpslldq xmm15,xmm15,4 + + vpternlogq xmm14,xmm15,xmm12,0x96 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vaesenclast zmm5,zmm5,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vpxorq zmm5,zmm5,zmm21 + vextracti32x4 xmm11,zmm5,3 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm4 + vmovdqu8 ZMMWORD[192+r11*1+r10]{k1},zmm5 + vmovdqu8 zmm5{k1}{z},zmm5 + vpshufb zmm17,zmm0,zmm29 + vpshufb zmm19,zmm3,zmm29 + vpshufb zmm20,zmm4,zmm29 + vpshufb zmm21,zmm5,zmm29 + vextracti32x4 xmm7,zmm21,3 + sub r13,16 * (16 - 1) +$L$_small_initial_partial_block_120: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[112+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[176+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm21,zmm1,0x01 + vpclmulqdq zmm5,zmm21,zmm1,0x10 + vpclmulqdq zmm0,zmm21,zmm1,0x11 + vpclmulqdq zmm3,zmm21,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm31 + vpxorq zmm0,zmm0,zmm8 + vpxorq zmm3,zmm3,zmm22 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + 
+$L$_small_initial_compute_done_120: + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_120: + jmp NEAR $L$_last_blocks_done_88 +$L$_last_num_blocks_is_0_88: + vmovdqa64 zmm13,ZMMWORD[1280+rsp] + vmovdqu64 zmm12,ZMMWORD[512+rsp] + vpclmulqdq zmm0,zmm13,zmm12,0x11 + vpclmulqdq zmm3,zmm13,zmm12,0x00 + vpclmulqdq zmm4,zmm13,zmm12,0x01 + vpclmulqdq zmm5,zmm13,zmm12,0x10 + vmovdqa64 zmm13,ZMMWORD[1344+rsp] + vmovdqu64 zmm12,ZMMWORD[576+rsp] + vpclmulqdq zmm6,zmm13,zmm12,0x11 + vpclmulqdq zmm7,zmm13,zmm12,0x00 + vpclmulqdq zmm10,zmm13,zmm12,0x01 + vpclmulqdq zmm11,zmm13,zmm12,0x10 + vpternlogq zmm26,zmm4,zmm10,0x96 + vpternlogq zmm24,zmm0,zmm6,0x96 + vpternlogq zmm25,zmm3,zmm7,0x96 + vpternlogq zmm26,zmm5,zmm11,0x96 + vmovdqa64 zmm13,ZMMWORD[1408+rsp] + vmovdqu64 zmm12,ZMMWORD[640+rsp] + vpclmulqdq zmm0,zmm13,zmm12,0x11 + vpclmulqdq zmm3,zmm13,zmm12,0x00 + vpclmulqdq zmm4,zmm13,zmm12,0x01 + vpclmulqdq zmm5,zmm13,zmm12,0x10 + vmovdqa64 zmm13,ZMMWORD[1472+rsp] + vmovdqu64 zmm12,ZMMWORD[704+rsp] + vpclmulqdq zmm6,zmm13,zmm12,0x11 + vpclmulqdq zmm7,zmm13,zmm12,0x00 + vpclmulqdq zmm10,zmm13,zmm12,0x01 + vpclmulqdq zmm11,zmm13,zmm12,0x10 + + vpternlogq zmm26,zmm4,zmm10,0x96 + vpternlogq zmm24,zmm0,zmm6,0x96 + vpternlogq zmm25,zmm3,zmm7,0x96 + vpternlogq zmm26,zmm5,zmm11,0x96 + + vpsrldq zmm0,zmm26,8 + vpslldq zmm3,zmm26,8 + vpxorq zmm24,zmm24,zmm0 + vpxorq zmm25,zmm25,zmm3 + vextracti64x4 ymm0,zmm24,1 + vpxorq ymm24,ymm24,ymm0 + vextracti32x4 xmm0,ymm24,1 + vpxorq xmm24,xmm24,xmm0 + vextracti64x4 ymm3,zmm25,1 + vpxorq ymm25,ymm25,ymm3 + vextracti32x4 xmm3,ymm25,1 + vpxorq xmm25,xmm25,xmm3 + vmovdqa64 xmm4,XMMWORD[POLY2] + + + vpclmulqdq xmm0,xmm4,xmm25,0x01 + vpslldq xmm0,xmm0,8 + vpxorq xmm0,xmm25,xmm0 + + + vpclmulqdq xmm3,xmm4,xmm0,0x00 + vpsrldq xmm3,xmm3,4 + vpclmulqdq xmm14,xmm4,xmm0,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm3,xmm24,0x96 + +$L$_last_blocks_done_88: + vpshufb xmm2,xmm2,xmm29 + jmp NEAR $L$_ghash_done_10 + +$L$_message_below_32_blocks_10: + + + sub r13,256 + add r11,256 + mov r10d,r13d + test r14,r14 + jnz NEAR $L$_skip_hkeys_precomputation_121 + vmovdqu64 zmm3,ZMMWORD[640+rsp] + + + vshufi64x2 zmm3,zmm3,zmm3,0x00 + + vmovdqu64 zmm4,ZMMWORD[576+rsp] + vmovdqu64 zmm5,ZMMWORD[512+rsp] + + vpclmulqdq zmm6,zmm4,zmm3,0x11 + vpclmulqdq zmm7,zmm4,zmm3,0x00 + vpclmulqdq zmm10,zmm4,zmm3,0x01 + vpclmulqdq zmm4,zmm4,zmm3,0x10 + vpxorq zmm4,zmm4,zmm10 + + vpsrldq zmm10,zmm4,8 + vpslldq zmm4,zmm4,8 + vpxorq zmm6,zmm6,zmm10 + vpxorq zmm4,zmm4,zmm7 + + + + vmovdqu64 zmm10,ZMMWORD[POLY2] + + vpclmulqdq zmm7,zmm10,zmm4,0x01 + vpslldq zmm7,zmm7,8 + vpxorq zmm4,zmm4,zmm7 + + + + vpclmulqdq zmm7,zmm10,zmm4,0x00 + vpsrldq zmm7,zmm7,4 + vpclmulqdq zmm4,zmm10,zmm4,0x10 + vpslldq zmm4,zmm4,4 + + vpternlogq zmm4,zmm6,zmm7,0x96 + + vmovdqu64 ZMMWORD[448+rsp],zmm4 + + vpclmulqdq zmm6,zmm5,zmm3,0x11 + vpclmulqdq zmm7,zmm5,zmm3,0x00 + vpclmulqdq zmm10,zmm5,zmm3,0x01 + vpclmulqdq zmm5,zmm5,zmm3,0x10 + vpxorq zmm5,zmm5,zmm10 + + vpsrldq zmm10,zmm5,8 + vpslldq zmm5,zmm5,8 + vpxorq zmm6,zmm6,zmm10 + vpxorq zmm5,zmm5,zmm7 + + + + vmovdqu64 zmm10,ZMMWORD[POLY2] + + vpclmulqdq zmm7,zmm10,zmm5,0x01 + vpslldq zmm7,zmm7,8 + vpxorq zmm5,zmm5,zmm7 + + + + vpclmulqdq zmm7,zmm10,zmm5,0x00 + vpsrldq zmm7,zmm7,4 + vpclmulqdq zmm5,zmm10,zmm5,0x10 + vpslldq zmm5,zmm5,4 + + vpternlogq zmm5,zmm6,zmm7,0x96 + + vmovdqu64 ZMMWORD[384+rsp],zmm5 + + vpclmulqdq zmm6,zmm4,zmm3,0x11 + vpclmulqdq zmm7,zmm4,zmm3,0x00 + vpclmulqdq zmm10,zmm4,zmm3,0x01 + vpclmulqdq zmm4,zmm4,zmm3,0x10 + vpxorq zmm4,zmm4,zmm10 + + vpsrldq zmm10,zmm4,8 + 
vpslldq zmm4,zmm4,8 + vpxorq zmm6,zmm6,zmm10 + vpxorq zmm4,zmm4,zmm7 + + + + vmovdqu64 zmm10,ZMMWORD[POLY2] + + vpclmulqdq zmm7,zmm10,zmm4,0x01 + vpslldq zmm7,zmm7,8 + vpxorq zmm4,zmm4,zmm7 + + + + vpclmulqdq zmm7,zmm10,zmm4,0x00 + vpsrldq zmm7,zmm7,4 + vpclmulqdq zmm4,zmm10,zmm4,0x10 + vpslldq zmm4,zmm4,4 + + vpternlogq zmm4,zmm6,zmm7,0x96 + + vmovdqu64 ZMMWORD[320+rsp],zmm4 + + vpclmulqdq zmm6,zmm5,zmm3,0x11 + vpclmulqdq zmm7,zmm5,zmm3,0x00 + vpclmulqdq zmm10,zmm5,zmm3,0x01 + vpclmulqdq zmm5,zmm5,zmm3,0x10 + vpxorq zmm5,zmm5,zmm10 + + vpsrldq zmm10,zmm5,8 + vpslldq zmm5,zmm5,8 + vpxorq zmm6,zmm6,zmm10 + vpxorq zmm5,zmm5,zmm7 + + + + vmovdqu64 zmm10,ZMMWORD[POLY2] + + vpclmulqdq zmm7,zmm10,zmm5,0x01 + vpslldq zmm7,zmm7,8 + vpxorq zmm5,zmm5,zmm7 + + + + vpclmulqdq zmm7,zmm10,zmm5,0x00 + vpsrldq zmm7,zmm7,4 + vpclmulqdq zmm5,zmm10,zmm5,0x10 + vpslldq zmm5,zmm5,4 + + vpternlogq zmm5,zmm6,zmm7,0x96 + + vmovdqu64 ZMMWORD[256+rsp],zmm5 +$L$_skip_hkeys_precomputation_121: + mov r14,1 + and r10d,~15 + mov ebx,512 + sub ebx,r10d + mov r10d,r13d + add r10d,15 + shr r10d,4 + je NEAR $L$_last_num_blocks_is_0_122 + + cmp r10d,8 + je NEAR $L$_last_num_blocks_is_8_122 + jb NEAR $L$_last_num_blocks_is_7_1_122 + + + cmp r10d,12 + je NEAR $L$_last_num_blocks_is_12_122 + jb NEAR $L$_last_num_blocks_is_11_9_122 + + + cmp r10d,15 + je NEAR $L$_last_num_blocks_is_15_122 + ja NEAR $L$_last_num_blocks_is_16_122 + cmp r10d,14 + je NEAR $L$_last_num_blocks_is_14_122 + jmp NEAR $L$_last_num_blocks_is_13_122 + +$L$_last_num_blocks_is_11_9_122: + + cmp r10d,10 + je NEAR $L$_last_num_blocks_is_10_122 + ja NEAR $L$_last_num_blocks_is_11_122 + jmp NEAR $L$_last_num_blocks_is_9_122 + +$L$_last_num_blocks_is_7_1_122: + cmp r10d,4 + je NEAR $L$_last_num_blocks_is_4_122 + jb NEAR $L$_last_num_blocks_is_3_1_122 + + cmp r10d,6 + ja NEAR $L$_last_num_blocks_is_7_122 + je NEAR $L$_last_num_blocks_is_6_122 + jmp NEAR $L$_last_num_blocks_is_5_122 + +$L$_last_num_blocks_is_3_1_122: + + cmp r10d,2 + ja NEAR $L$_last_num_blocks_is_3_122 + je NEAR $L$_last_num_blocks_is_2_122 +$L$_last_num_blocks_is_1_122: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + kmovq k1,[rax*8+r10] + cmp r15d,255 + jae NEAR $L$_16_blocks_overflow_123 + vpaddd xmm0,xmm2,xmm28 + jmp NEAR $L$_16_blocks_ok_123 + +$L$_16_blocks_overflow_123: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpshufb xmm0,xmm0,xmm29 +$L$_16_blocks_ok_123: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm0,0 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc xmm0,xmm0,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc xmm0,xmm0,xmm31 + vbroadcastf64x2 
zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 xmm17{k1}{z},[r11*1+r9] + vaesenc xmm0,xmm0,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc xmm0,xmm0,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc xmm0,xmm0,xmm31 + vaesenclast xmm0,xmm0,xmm30 + vpxorq xmm0,xmm0,xmm17 + vextracti32x4 xmm11,zmm0,0 + mov r10,QWORD[120+rbp] + vmovdqu8 XMMWORD[r11*1+r10]{k1},xmm0 + vmovdqu8 zmm0{k1}{z},zmm0 + vpshufb xmm17,xmm0,xmm29 + vextracti32x4 xmm7,zmm17,0 + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_124 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm17,xmm1,0x01 + vpclmulqdq xmm5,xmm17,xmm1,0x10 + vpclmulqdq xmm0,xmm17,xmm1,0x11 + vpclmulqdq xmm3,xmm17,xmm1,0x00 + vpxorq zmm4,zmm4,zmm26 + vpxorq zmm0,zmm0,zmm24 + vpxorq zmm3,zmm3,zmm25 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_124 +$L$_small_initial_partial_block_124: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + + + vpsrldq zmm0,zmm26,8 + vpslldq zmm3,zmm26,8 + vpxorq zmm24,zmm24,zmm0 + vpxorq zmm25,zmm25,zmm3 + vextracti64x4 ymm0,zmm24,1 + vpxorq ymm24,ymm24,ymm0 + vextracti32x4 xmm0,ymm24,1 + vpxorq xmm24,xmm24,xmm0 + vextracti64x4 ymm3,zmm25,1 + vpxorq ymm25,ymm25,ymm3 + vextracti32x4 xmm3,ymm25,1 + vpxorq xmm25,xmm25,xmm3 + vmovdqa64 xmm0,XMMWORD[POLY2] + + + vpclmulqdq xmm3,xmm0,xmm25,0x01 + vpslldq xmm3,xmm3,8 + vpxorq xmm3,xmm25,xmm3 + + + vpclmulqdq xmm4,xmm0,xmm3,0x00 + vpsrldq xmm4,xmm4,4 + vpclmulqdq xmm14,xmm0,xmm3,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm4,xmm24,0x96 + + + + + + + + + + + + + vpxorq xmm14,xmm14,xmm7 + + jmp NEAR $L$_after_reduction_124 +$L$_small_initial_compute_done_124: +$L$_after_reduction_124: + jmp NEAR $L$_last_blocks_done_122 +$L$_last_num_blocks_is_2_122: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + kmovq k1,[rax*8+r10] + cmp r15d,254 + jae NEAR $L$_16_blocks_overflow_125 + vpaddd ymm0,ymm2,ymm28 + jmp NEAR $L$_16_blocks_ok_125 + +$L$_16_blocks_overflow_125: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpshufb ymm0,ymm0,ymm29 +$L$_16_blocks_ok_125: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm0,1 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 
zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc ymm0,ymm0,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc ymm0,ymm0,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 ymm17{k1}{z},[r11*1+r9] + vaesenc ymm0,ymm0,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc ymm0,ymm0,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc ymm0,ymm0,ymm31 + vaesenclast ymm0,ymm0,ymm30 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm11,zmm0,1 + mov r10,QWORD[120+rbp] + vmovdqu8 YMMWORD[r11*1+r10]{k1},ymm0 + vmovdqu8 zmm0{k1}{z},zmm0 + vpshufb ymm17,ymm0,ymm29 + vextracti32x4 xmm7,zmm17,1 + sub r13,16 * (2 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_126 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm17,ymm1,0x01 + vpclmulqdq ymm5,ymm17,ymm1,0x10 + vpclmulqdq ymm0,ymm17,ymm1,0x11 + vpclmulqdq ymm3,ymm17,ymm1,0x00 + vpxorq zmm4,zmm4,zmm26 + vpxorq zmm0,zmm0,zmm24 + vpxorq zmm3,zmm3,zmm25 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_126 +$L$_small_initial_partial_block_126: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm17,xmm1,0x01 + vpclmulqdq xmm5,xmm17,xmm1,0x10 + vpclmulqdq xmm0,xmm17,xmm1,0x11 + vpclmulqdq xmm3,xmm17,xmm1,0x00 + vpxorq zmm4,zmm4,zmm26 + vpxorq zmm0,zmm0,zmm24 + vpxorq zmm3,zmm3,zmm25 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + 
vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_126: + + or r13,r13 + je NEAR $L$_after_reduction_126 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_126: + jmp NEAR $L$_last_blocks_done_122 +$L$_last_num_blocks_is_3_122: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + kmovq k1,[rax*8+r10] + cmp r15d,253 + jae NEAR $L$_16_blocks_overflow_127 + vpaddd zmm0,zmm2,zmm28 + jmp NEAR $L$_16_blocks_ok_127 + +$L$_16_blocks_overflow_127: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpshufb zmm0,zmm0,zmm29 +$L$_16_blocks_ok_127: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm0,2 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17{k1}{z},[r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vpxorq zmm0,zmm0,zmm17 + vextracti32x4 xmm11,zmm0,2 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10]{k1},zmm0 + vmovdqu8 zmm0{k1}{z},zmm0 + vpshufb zmm17,zmm0,zmm29 + vextracti32x4 xmm7,zmm17,2 + sub r13,16 * (3 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_128 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpxorq zmm4,zmm4,zmm26 + vpxorq zmm0,zmm0,zmm24 + vpxorq zmm3,zmm3,zmm25 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + 
vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_128 +$L$_small_initial_partial_block_128: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm17,ymm1,0x01 + vpclmulqdq ymm5,ymm17,ymm1,0x10 + vpclmulqdq ymm0,ymm17,ymm1,0x11 + vpclmulqdq ymm3,ymm17,ymm1,0x00 + vpxorq zmm4,zmm4,zmm26 + vpxorq zmm0,zmm0,zmm24 + vpxorq zmm3,zmm3,zmm25 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_128: + + or r13,r13 + je NEAR $L$_after_reduction_128 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_128: + jmp NEAR $L$_last_blocks_done_122 +$L$_last_num_blocks_is_4_122: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + kmovq k1,[rax*8+r10] + cmp r15d,252 + jae NEAR $L$_16_blocks_overflow_129 + vpaddd zmm0,zmm2,zmm28 + jmp NEAR $L$_16_blocks_ok_129 + +$L$_16_blocks_overflow_129: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpshufb zmm0,zmm0,zmm29 +$L$_16_blocks_ok_129: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm0,3 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17{k1}{z},[r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + 
vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vpxorq zmm0,zmm0,zmm17 + vextracti32x4 xmm11,zmm0,3 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10]{k1},zmm0 + vmovdqu8 zmm0{k1}{z},zmm0 + vpshufb zmm17,zmm0,zmm29 + vextracti32x4 xmm7,zmm17,3 + sub r13,16 * (4 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_130 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + + vpxorq zmm30,zmm30,zmm26 + vpxorq zmm8,zmm8,zmm24 + vpxorq zmm22,zmm22,zmm25 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_130 +$L$_small_initial_partial_block_130: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpxorq zmm4,zmm4,zmm26 + vpxorq zmm0,zmm0,zmm24 + vpxorq zmm3,zmm3,zmm25 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_130: + + or r13,r13 + je NEAR $L$_after_reduction_130 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_130: + jmp NEAR $L$_last_blocks_done_122 +$L$_last_num_blocks_is_5_122: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,64 + kmovq k1,[rax*8+r10] + cmp r15d,251 + jae NEAR $L$_16_blocks_overflow_131 + vpaddd zmm0,zmm2,zmm28 + vpaddd xmm3,xmm0,xmm27 + jmp NEAR $L$_16_blocks_ok_131 + +$L$_16_blocks_overflow_131: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb xmm3,xmm3,xmm29 +$L$_16_blocks_ok_131: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 
xmm2,zmm3,0 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 xmm19{k1}{z},[64+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast xmm3,xmm3,xmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq xmm3,xmm3,xmm19 + vextracti32x4 xmm11,zmm3,0 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 XMMWORD[64+r11*1+r10]{k1},xmm3 + vmovdqu8 zmm3{k1}{z},zmm3 + vpshufb zmm17,zmm0,zmm29 + vpshufb xmm19,xmm3,xmm29 + vextracti32x4 xmm7,zmm19,0 + sub r13,16 * (5 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_132 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm19,xmm1,0x01 + vpclmulqdq xmm5,xmm19,xmm1,0x10 + vpclmulqdq xmm0,xmm19,xmm1,0x11 + vpclmulqdq xmm3,xmm19,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + 
vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_132 +$L$_small_initial_partial_block_132: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + + vpxorq zmm30,zmm30,zmm26 + vpxorq zmm8,zmm8,zmm24 + vpxorq zmm22,zmm22,zmm25 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_132: + + or r13,r13 + je NEAR $L$_after_reduction_132 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_132: + jmp NEAR $L$_last_blocks_done_122 +$L$_last_num_blocks_is_6_122: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,64 + kmovq k1,[rax*8+r10] + cmp r15d,250 + jae NEAR $L$_16_blocks_overflow_133 + vpaddd zmm0,zmm2,zmm28 + vpaddd ymm3,ymm0,ymm27 + jmp NEAR $L$_16_blocks_ok_133 + +$L$_16_blocks_overflow_133: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb ymm3,ymm3,ymm29 +$L$_16_blocks_ok_133: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm3,1 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 ymm19{k1}{z},[64+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + 
vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast ymm3,ymm3,ymm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm11,zmm3,1 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 YMMWORD[64+r11*1+r10]{k1},ymm3 + vmovdqu8 zmm3{k1}{z},zmm3 + vpshufb zmm17,zmm0,zmm29 + vpshufb ymm19,ymm3,ymm29 + vextracti32x4 xmm7,zmm19,1 + sub r13,16 * (6 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_134 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm19,ymm1,0x01 + vpclmulqdq ymm5,ymm19,ymm1,0x10 + vpclmulqdq ymm0,ymm19,ymm1,0x11 + vpclmulqdq ymm3,ymm19,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_134 +$L$_small_initial_partial_block_134: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm19,xmm1,0x01 + vpclmulqdq xmm5,xmm19,xmm1,0x10 + vpclmulqdq xmm0,xmm19,xmm1,0x11 + vpclmulqdq xmm3,xmm19,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_134: + + or r13,r13 + je NEAR $L$_after_reduction_134 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_134: + jmp NEAR $L$_last_blocks_done_122 +$L$_last_num_blocks_is_7_122: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,64 + kmovq 
k1,[rax*8+r10] + cmp r15d,249 + jae NEAR $L$_16_blocks_overflow_135 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + jmp NEAR $L$_16_blocks_ok_135 + +$L$_16_blocks_overflow_135: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 +$L$_16_blocks_ok_135: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm3,2 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19{k1}{z},[64+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti32x4 xmm11,zmm3,2 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10]{k1},zmm3 + vmovdqu8 zmm3{k1}{z},zmm3 + vpshufb zmm17,zmm0,zmm29 + vpshufb zmm19,zmm3,zmm29 + vextracti32x4 xmm7,zmm19,2 + sub r13,16 * (7 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_136 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm19,zmm1,0x01 + vpclmulqdq zmm5,zmm19,zmm1,0x10 + vpclmulqdq zmm0,zmm19,zmm1,0x11 + vpclmulqdq zmm3,zmm19,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + 
vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_136 +$L$_small_initial_partial_block_136: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm19,ymm1,0x01 + vpclmulqdq ymm5,ymm19,ymm1,0x10 + vpclmulqdq ymm0,ymm19,ymm1,0x11 + vpclmulqdq ymm3,ymm19,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_136: + + or r13,r13 + je NEAR $L$_after_reduction_136 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_136: + jmp NEAR $L$_last_blocks_done_122 +$L$_last_num_blocks_is_8_122: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,64 + kmovq k1,[rax*8+r10] + cmp r15d,248 + jae NEAR $L$_16_blocks_overflow_137 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + jmp NEAR $L$_16_blocks_ok_137 + +$L$_16_blocks_overflow_137: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 +$L$_16_blocks_ok_137: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm3,3 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 
zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19{k1}{z},[64+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti32x4 xmm11,zmm3,3 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10]{k1},zmm3 + vmovdqu8 zmm3{k1}{z},zmm3 + vpshufb zmm17,zmm0,zmm29 + vpshufb zmm19,zmm3,zmm29 + vextracti32x4 xmm7,zmm19,3 + sub r13,16 * (8 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_138 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[224+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + + vpxorq zmm30,zmm30,zmm26 + vpxorq zmm8,zmm8,zmm24 + vpxorq zmm22,zmm22,zmm25 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_138 +$L$_small_initial_partial_block_138: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm19,zmm1,0x01 + vpclmulqdq zmm5,zmm19,zmm1,0x10 + vpclmulqdq zmm0,zmm19,zmm1,0x11 + vpclmulqdq zmm3,zmm19,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq 
zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_138: + + or r13,r13 + je NEAR $L$_after_reduction_138 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_138: + jmp NEAR $L$_last_blocks_done_122 +$L$_last_num_blocks_is_9_122: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,128 + kmovq k1,[rax*8+r10] + cmp r15d,247 + jae NEAR $L$_16_blocks_overflow_139 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd xmm4,xmm3,xmm27 + jmp NEAR $L$_16_blocks_ok_139 + +$L$_16_blocks_overflow_139: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb xmm4,xmm4,xmm29 +$L$_16_blocks_ok_139: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm4,0 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 xmm20{k1}{z},[128+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 
+ vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast xmm4,xmm4,xmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq xmm4,xmm4,xmm20 + vextracti32x4 xmm11,zmm4,0 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 XMMWORD[128+r11*1+r10]{k1},xmm4 + vmovdqu8 zmm4{k1}{z},zmm4 + vpshufb zmm17,zmm0,zmm29 + vpshufb zmm19,zmm3,zmm29 + vpshufb xmm20,xmm4,xmm29 + vextracti32x4 xmm7,zmm20,0 + sub r13,16 * (9 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_140 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[208+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm20,xmm1,0x01 + vpclmulqdq xmm5,xmm20,xmm1,0x10 + vpclmulqdq xmm0,xmm20,xmm1,0x11 + vpclmulqdq xmm3,xmm20,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_140 +$L$_small_initial_partial_block_140: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[224+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + + vpxorq zmm30,zmm30,zmm26 + vpxorq zmm8,zmm8,zmm24 + vpxorq zmm22,zmm22,zmm25 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 
+ +$L$_small_initial_compute_done_140: + + or r13,r13 + je NEAR $L$_after_reduction_140 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_140: + jmp NEAR $L$_last_blocks_done_122 +$L$_last_num_blocks_is_10_122: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,128 + kmovq k1,[rax*8+r10] + cmp r15d,246 + jae NEAR $L$_16_blocks_overflow_141 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd ymm4,ymm3,ymm27 + jmp NEAR $L$_16_blocks_ok_141 + +$L$_16_blocks_overflow_141: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb ymm4,ymm4,ymm29 +$L$_16_blocks_ok_141: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm4,1 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 ymm20{k1}{z},[128+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast ymm4,ymm4,ymm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq ymm4,ymm4,ymm20 + vextracti32x4 xmm11,zmm4,1 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 
ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 YMMWORD[128+r11*1+r10]{k1},ymm4 + vmovdqu8 zmm4{k1}{z},zmm4 + vpshufb zmm17,zmm0,zmm29 + vpshufb zmm19,zmm3,zmm29 + vpshufb ymm20,ymm4,ymm29 + vextracti32x4 xmm7,zmm20,1 + sub r13,16 * (10 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_142 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[192+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm20,ymm1,0x01 + vpclmulqdq ymm5,ymm20,ymm1,0x10 + vpclmulqdq ymm0,ymm20,ymm1,0x11 + vpclmulqdq ymm3,ymm20,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_142 +$L$_small_initial_partial_block_142: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[208+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm20,xmm1,0x01 + vpclmulqdq xmm5,xmm20,xmm1,0x10 + vpclmulqdq xmm0,xmm20,xmm1,0x11 + vpclmulqdq xmm3,xmm20,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_142: + + or r13,r13 + je NEAR $L$_after_reduction_142 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_142: + jmp NEAR $L$_last_blocks_done_122 +$L$_last_num_blocks_is_11_122: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,128 + kmovq k1,[rax*8+r10] + cmp r15d,245 + jae NEAR $L$_16_blocks_overflow_143 + vpaddd 
zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + jmp NEAR $L$_16_blocks_ok_143 + +$L$_16_blocks_overflow_143: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 +$L$_16_blocks_ok_143: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm4,2 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20{k1}{z},[128+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vextracti32x4 xmm11,zmm4,2 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10]{k1},zmm4 + vmovdqu8 zmm4{k1}{z},zmm4 + vpshufb zmm17,zmm0,zmm29 + vpshufb zmm19,zmm3,zmm29 + vpshufb zmm20,zmm4,zmm29 + vextracti32x4 xmm7,zmm20,2 + sub r13,16 * (11 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_144 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[176+rdx] 
+ vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm20,zmm1,0x01 + vpclmulqdq zmm5,zmm20,zmm1,0x10 + vpclmulqdq zmm0,zmm20,zmm1,0x11 + vpclmulqdq zmm3,zmm20,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_144 +$L$_small_initial_partial_block_144: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[192+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm20,ymm1,0x01 + vpclmulqdq ymm5,ymm20,ymm1,0x10 + vpclmulqdq ymm0,ymm20,ymm1,0x11 + vpclmulqdq ymm3,ymm20,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_144: + + or r13,r13 + je NEAR $L$_after_reduction_144 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_144: + jmp NEAR $L$_last_blocks_done_122 +$L$_last_num_blocks_is_12_122: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,128 + kmovq k1,[rax*8+r10] + cmp r15d,244 + jae NEAR $L$_16_blocks_overflow_145 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + jmp NEAR $L$_16_blocks_ok_145 + +$L$_16_blocks_overflow_145: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb 
zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 +$L$_16_blocks_ok_145: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm4,3 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20{k1}{z},[128+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vextracti32x4 xmm11,zmm4,3 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10]{k1},zmm4 + vmovdqu8 zmm4{k1}{z},zmm4 + vpshufb zmm17,zmm0,zmm29 + vpshufb zmm19,zmm3,zmm29 + vpshufb zmm20,zmm4,zmm29 + vextracti32x4 xmm7,zmm20,3 + sub r13,16 * (12 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_146 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[160+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[224+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + 
vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + + vpxorq zmm30,zmm30,zmm26 + vpxorq zmm8,zmm8,zmm24 + vpxorq zmm22,zmm22,zmm25 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_146 +$L$_small_initial_partial_block_146: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[176+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm20,zmm1,0x01 + vpclmulqdq zmm5,zmm20,zmm1,0x10 + vpclmulqdq zmm0,zmm20,zmm1,0x11 + vpclmulqdq zmm3,zmm20,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_146: + + or r13,r13 + je NEAR $L$_after_reduction_146 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_146: + jmp NEAR $L$_last_blocks_done_122 +$L$_last_num_blocks_is_13_122: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,192 + kmovq k1,[rax*8+r10] + cmp r15d,243 + jae NEAR $L$_16_blocks_overflow_147 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + vpaddd xmm5,xmm4,xmm27 + jmp NEAR $L$_16_blocks_ok_147 + +$L$_16_blocks_overflow_147: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpaddd zmm5,zmm4,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb xmm5,xmm5,xmm29 +$L$_16_blocks_ok_147: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm5,0 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + 
vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vpxorq xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20,ZMMWORD[128+r11*1+r9] + vmovdqu8 xmm21{k1}{z},[192+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vaesenclast xmm5,xmm5,xmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vpxorq xmm5,xmm5,xmm21 + vextracti32x4 xmm11,zmm5,0 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm4 + vmovdqu8 XMMWORD[192+r11*1+r10]{k1},xmm5 + vmovdqu8 zmm5{k1}{z},zmm5 + vpshufb zmm17,zmm0,zmm29 + vpshufb zmm19,zmm3,zmm29 + vpshufb zmm20,zmm4,zmm29 + vpshufb xmm21,xmm5,xmm29 + vextracti32x4 xmm7,zmm21,0 + sub r13,16 * (13 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_148 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[144+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + 
vmovdqu64 zmm1,ZMMWORD[208+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm21,xmm1,0x01 + vpclmulqdq xmm5,xmm21,xmm1,0x10 + vpclmulqdq xmm0,xmm21,xmm1,0x11 + vpclmulqdq xmm3,xmm21,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_148 +$L$_small_initial_partial_block_148: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[160+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[224+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + + vpxorq zmm30,zmm30,zmm26 + vpxorq zmm8,zmm8,zmm24 + vpxorq zmm22,zmm22,zmm25 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_148: + + or r13,r13 + je NEAR $L$_after_reduction_148 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_148: + jmp NEAR $L$_last_blocks_done_122 +$L$_last_num_blocks_is_14_122: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,192 + kmovq k1,[rax*8+r10] + cmp r15d,242 + jae NEAR $L$_16_blocks_overflow_149 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + vpaddd ymm5,ymm4,ymm27 + jmp NEAR $L$_16_blocks_ok_149 + +$L$_16_blocks_overflow_149: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 
+ vpaddd zmm5,zmm4,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb ymm5,ymm5,ymm29 +$L$_16_blocks_ok_149: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm5,1 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vpxorq ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20,ZMMWORD[128+r11*1+r9] + vmovdqu8 ymm21{k1}{z},[192+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vaesenclast ymm5,ymm5,ymm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vpxorq ymm5,ymm5,ymm21 + vextracti32x4 xmm11,zmm5,1 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm4 + vmovdqu8 YMMWORD[192+r11*1+r10]{k1},ymm5 + vmovdqu8 zmm5{k1}{z},zmm5 + vpshufb zmm17,zmm0,zmm29 + vpshufb zmm19,zmm3,zmm29 + vpshufb zmm20,zmm4,zmm29 + vpshufb ymm21,ymm5,ymm29 + 
vextracti32x4 xmm7,zmm21,1 + sub r13,16 * (14 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_150 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[128+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[192+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm21,ymm1,0x01 + vpclmulqdq ymm5,ymm21,ymm1,0x10 + vpclmulqdq ymm0,ymm21,ymm1,0x11 + vpclmulqdq ymm3,ymm21,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_150 +$L$_small_initial_partial_block_150: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[144+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[208+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm21,xmm1,0x01 + vpclmulqdq xmm5,xmm21,xmm1,0x10 + vpclmulqdq xmm0,xmm21,xmm1,0x11 + vpclmulqdq xmm3,xmm21,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_150: + + or r13,r13 + je NEAR $L$_after_reduction_150 + vpxorq xmm14,xmm14,xmm7 
+$L$_after_reduction_150: + jmp NEAR $L$_last_blocks_done_122 +$L$_last_num_blocks_is_15_122: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,192 + kmovq k1,[rax*8+r10] + cmp r15d,241 + jae NEAR $L$_16_blocks_overflow_151 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + vpaddd zmm5,zmm4,zmm27 + jmp NEAR $L$_16_blocks_ok_151 + +$L$_16_blocks_overflow_151: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpaddd zmm5,zmm4,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb zmm5,zmm5,zmm29 +$L$_16_blocks_ok_151: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm5,2 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20,ZMMWORD[128+r11*1+r9] + vmovdqu8 zmm21{k1}{z},[192+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vaesenclast 
zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vaesenclast zmm5,zmm5,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vpxorq zmm5,zmm5,zmm21 + vextracti32x4 xmm11,zmm5,2 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm4 + vmovdqu8 ZMMWORD[192+r11*1+r10]{k1},zmm5 + vmovdqu8 zmm5{k1}{z},zmm5 + vpshufb zmm17,zmm0,zmm29 + vpshufb zmm19,zmm3,zmm29 + vpshufb zmm20,zmm4,zmm29 + vpshufb zmm21,zmm5,zmm29 + vextracti32x4 xmm7,zmm21,2 + sub r13,16 * (15 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_152 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[112+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[176+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm21,zmm1,0x01 + vpclmulqdq zmm5,zmm21,zmm1,0x10 + vpclmulqdq zmm0,zmm21,zmm1,0x11 + vpclmulqdq zmm3,zmm21,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_152 +$L$_small_initial_partial_block_152: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[128+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[192+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm21,ymm1,0x01 + vpclmulqdq ymm5,ymm21,ymm1,0x10 + vpclmulqdq ymm0,ymm21,ymm1,0x11 + vpclmulqdq ymm3,ymm21,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + 
vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_152: + + or r13,r13 + je NEAR $L$_after_reduction_152 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_152: + jmp NEAR $L$_last_blocks_done_122 +$L$_last_num_blocks_is_16_122: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,192 + kmovq k1,[rax*8+r10] + cmp r15d,240 + jae NEAR $L$_16_blocks_overflow_153 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + vpaddd zmm5,zmm4,zmm27 + jmp NEAR $L$_16_blocks_ok_153 + +$L$_16_blocks_overflow_153: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpaddd zmm5,zmm4,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb zmm5,zmm5,zmm29 +$L$_16_blocks_ok_153: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm5,3 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20,ZMMWORD[128+r11*1+r9] + vmovdqu8 zmm21{k1}{z},[192+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 
+ vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vaesenclast zmm5,zmm5,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vpxorq zmm5,zmm5,zmm21 + vextracti32x4 xmm11,zmm5,3 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm4 + vmovdqu8 ZMMWORD[192+r11*1+r10]{k1},zmm5 + vmovdqu8 zmm5{k1}{z},zmm5 + vpshufb zmm17,zmm0,zmm29 + vpshufb zmm19,zmm3,zmm29 + vpshufb zmm20,zmm4,zmm29 + vpshufb zmm21,zmm5,zmm29 + vextracti32x4 xmm7,zmm21,3 + sub r13,16 * (16 - 1) +$L$_small_initial_partial_block_154: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[112+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[176+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm21,zmm1,0x01 + vpclmulqdq zmm5,zmm21,zmm1,0x10 + vpclmulqdq zmm0,zmm21,zmm1,0x11 + vpclmulqdq zmm3,zmm21,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_154: + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_154: + jmp NEAR $L$_last_blocks_done_122 +$L$_last_num_blocks_is_0_122: + vmovdqa64 zmm13,ZMMWORD[768+rsp] + vpxorq zmm13,zmm13,zmm14 + vmovdqu64 zmm12,ZMMWORD[rbx*1+rsp] + vpclmulqdq zmm0,zmm13,zmm12,0x11 + vpclmulqdq zmm3,zmm13,zmm12,0x00 + vpclmulqdq zmm4,zmm13,zmm12,0x01 + vpclmulqdq zmm5,zmm13,zmm12,0x10 + vmovdqa64 zmm13,ZMMWORD[832+rsp] + vmovdqu64 zmm12,ZMMWORD[64+rbx*1+rsp] + vpclmulqdq zmm6,zmm13,zmm12,0x11 + vpclmulqdq zmm7,zmm13,zmm12,0x00 + vpclmulqdq zmm10,zmm13,zmm12,0x01 + vpclmulqdq zmm11,zmm13,zmm12,0x10 + vpxorq zmm26,zmm4,zmm10 + 
vpxorq zmm24,zmm0,zmm6 + vpxorq zmm25,zmm3,zmm7 + vpternlogq zmm26,zmm5,zmm11,0x96 + vmovdqa64 zmm13,ZMMWORD[896+rsp] + vmovdqu64 zmm12,ZMMWORD[128+rbx*1+rsp] + vpclmulqdq zmm0,zmm13,zmm12,0x11 + vpclmulqdq zmm3,zmm13,zmm12,0x00 + vpclmulqdq zmm4,zmm13,zmm12,0x01 + vpclmulqdq zmm5,zmm13,zmm12,0x10 + vmovdqa64 zmm13,ZMMWORD[960+rsp] + vmovdqu64 zmm12,ZMMWORD[192+rbx*1+rsp] + vpclmulqdq zmm6,zmm13,zmm12,0x11 + vpclmulqdq zmm7,zmm13,zmm12,0x00 + vpclmulqdq zmm10,zmm13,zmm12,0x01 + vpclmulqdq zmm11,zmm13,zmm12,0x10 + + vpternlogq zmm26,zmm4,zmm10,0x96 + vpternlogq zmm24,zmm0,zmm6,0x96 + vpternlogq zmm25,zmm3,zmm7,0x96 + vpternlogq zmm26,zmm5,zmm11,0x96 + + vpsrldq zmm0,zmm26,8 + vpslldq zmm3,zmm26,8 + vpxorq zmm24,zmm24,zmm0 + vpxorq zmm25,zmm25,zmm3 + vextracti64x4 ymm0,zmm24,1 + vpxorq ymm24,ymm24,ymm0 + vextracti32x4 xmm0,ymm24,1 + vpxorq xmm24,xmm24,xmm0 + vextracti64x4 ymm3,zmm25,1 + vpxorq ymm25,ymm25,ymm3 + vextracti32x4 xmm3,ymm25,1 + vpxorq xmm25,xmm25,xmm3 + vmovdqa64 xmm4,XMMWORD[POLY2] + + + vpclmulqdq xmm0,xmm4,xmm25,0x01 + vpslldq xmm0,xmm0,8 + vpxorq xmm0,xmm25,xmm0 + + + vpclmulqdq xmm3,xmm4,xmm0,0x00 + vpsrldq xmm3,xmm3,4 + vpclmulqdq xmm14,xmm4,xmm0,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm3,xmm24,0x96 + +$L$_last_blocks_done_122: + vpshufb xmm2,xmm2,xmm29 + jmp NEAR $L$_ghash_done_10 + +$L$_message_below_equal_16_blocks_10: + + + mov r12d,r13d + add r12d,15 + shr r12d,4 + cmp r12,8 + je NEAR $L$_small_initial_num_blocks_is_8_155 + jl NEAR $L$_small_initial_num_blocks_is_7_1_155 + + + cmp r12,12 + je NEAR $L$_small_initial_num_blocks_is_12_155 + jl NEAR $L$_small_initial_num_blocks_is_11_9_155 + + + cmp r12,16 + je NEAR $L$_small_initial_num_blocks_is_16_155 + cmp r12,15 + je NEAR $L$_small_initial_num_blocks_is_15_155 + cmp r12,14 + je NEAR $L$_small_initial_num_blocks_is_14_155 + jmp NEAR $L$_small_initial_num_blocks_is_13_155 + +$L$_small_initial_num_blocks_is_11_9_155: + + cmp r12,11 + je NEAR $L$_small_initial_num_blocks_is_11_155 + cmp r12,10 + je NEAR $L$_small_initial_num_blocks_is_10_155 + jmp NEAR $L$_small_initial_num_blocks_is_9_155 + +$L$_small_initial_num_blocks_is_7_1_155: + cmp r12,4 + je NEAR $L$_small_initial_num_blocks_is_4_155 + jl NEAR $L$_small_initial_num_blocks_is_3_1_155 + + cmp r12,7 + je NEAR $L$_small_initial_num_blocks_is_7_155 + cmp r12,6 + je NEAR $L$_small_initial_num_blocks_is_6_155 + jmp NEAR $L$_small_initial_num_blocks_is_5_155 + +$L$_small_initial_num_blocks_is_3_1_155: + + cmp r12,3 + je NEAR $L$_small_initial_num_blocks_is_3_155 + cmp r12,2 + je NEAR $L$_small_initial_num_blocks_is_2_155 + + + + + +$L$_small_initial_num_blocks_is_1_155: + vmovdqa64 xmm29,XMMWORD[SHUF_MASK] + vpaddd xmm0,xmm2,XMMWORD[ONE] + lea r10,[byte64_len_to_mask_table] + mov r15,r13 + kmovq k1,[r15*8+r10] + vextracti32x4 xmm2,zmm0,0 + vpshufb xmm0,xmm0,xmm29 + vmovdqu8 xmm6{k1}{z},[r11*1+r9] + vbroadcastf64x2 zmm15,ZMMWORD[rcx] + vpxorq xmm0,xmm0,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[16+rcx] + vaesenc xmm0,xmm0,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[32+rcx] + vaesenc xmm0,xmm0,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[48+rcx] + vaesenc xmm0,xmm0,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[64+rcx] + vaesenc xmm0,xmm0,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[80+rcx] + vaesenc xmm0,xmm0,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[96+rcx] + vaesenc xmm0,xmm0,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[112+rcx] + vaesenc xmm0,xmm0,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[128+rcx] + vaesenc xmm0,xmm0,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[144+rcx] + vaesenc xmm0,xmm0,xmm15 + vbroadcastf64x2 
zmm15,ZMMWORD[160+rcx] + vaesenclast xmm0,xmm0,xmm15 + vpxorq xmm0,xmm0,xmm6 + vextracti32x4 xmm12,zmm0,0 + mov r10,QWORD[120+rbp] + vmovdqu8 XMMWORD[r11*1+r10]{k1},xmm0 + vmovdqu8 zmm0{k1}{z},zmm0 + vpshufb xmm6,xmm0,xmm29 + vextracti32x4 xmm13,zmm6,0 + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_156 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 xmm20,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm6,xmm20,0x01 + vpclmulqdq xmm5,xmm6,xmm20,0x10 + vpclmulqdq xmm0,xmm6,xmm20,0x11 + vpclmulqdq xmm3,xmm6,xmm20,0x00 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_156 +$L$_small_initial_partial_block_156: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm12 + + + + + + + + + + + + vpxorq xmm14,xmm14,xmm13 + + jmp NEAR $L$_after_reduction_156 +$L$_small_initial_compute_done_156: +$L$_after_reduction_156: + jmp NEAR $L$_small_initial_blocks_encrypted_155 +$L$_small_initial_num_blocks_is_2_155: + vmovdqa64 ymm29,YMMWORD[SHUF_MASK] + vshufi64x2 ymm0,ymm2,ymm2,0 + vpaddd ymm0,ymm0,YMMWORD[ddq_add_1234] + lea r10,[byte64_len_to_mask_table] + mov r15,r13 + kmovq k1,[r15*8+r10] + vextracti32x4 xmm2,zmm0,1 + vpshufb ymm0,ymm0,ymm29 + vmovdqu8 ymm6{k1}{z},[r11*1+r9] + vbroadcastf64x2 zmm15,ZMMWORD[rcx] + vpxorq ymm0,ymm0,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[16+rcx] + vaesenc ymm0,ymm0,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[32+rcx] + vaesenc ymm0,ymm0,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[48+rcx] + vaesenc ymm0,ymm0,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[64+rcx] + vaesenc ymm0,ymm0,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[80+rcx] + vaesenc ymm0,ymm0,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[96+rcx] + vaesenc ymm0,ymm0,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[112+rcx] + vaesenc ymm0,ymm0,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[128+rcx] + vaesenc ymm0,ymm0,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[144+rcx] + vaesenc ymm0,ymm0,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[160+rcx] + vaesenclast ymm0,ymm0,ymm15 + vpxorq ymm0,ymm0,ymm6 + vextracti32x4 xmm12,zmm0,1 + mov r10,QWORD[120+rbp] + vmovdqu8 YMMWORD[r11*1+r10]{k1},ymm0 + vmovdqu8 zmm0{k1}{z},zmm0 + vpshufb ymm6,ymm0,ymm29 + vextracti32x4 xmm13,zmm6,1 + sub r13,16 * (2 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_157 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 ymm20,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm6,ymm20,0x01 + vpclmulqdq ymm5,ymm6,ymm20,0x10 + vpclmulqdq ymm0,ymm6,ymm20,0x11 + vpclmulqdq ymm3,ymm6,ymm20,0x00 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq 
xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_157 +$L$_small_initial_partial_block_157: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm12 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 xmm20,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm6,xmm20,0x01 + vpclmulqdq xmm5,xmm6,xmm20,0x10 + vpclmulqdq xmm0,xmm6,xmm20,0x11 + vpclmulqdq xmm3,xmm6,xmm20,0x00 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_157: + + or r13,r13 + je NEAR $L$_after_reduction_157 + vpxorq xmm14,xmm14,xmm13 +$L$_after_reduction_157: + jmp NEAR $L$_small_initial_blocks_encrypted_155 +$L$_small_initial_num_blocks_is_3_155: + vmovdqa64 zmm29,ZMMWORD[SHUF_MASK] + vshufi64x2 zmm2,zmm2,zmm2,0 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + lea r10,[byte64_len_to_mask_table] + mov r15,r13 + kmovq k1,[r15*8+r10] + vextracti32x4 xmm2,zmm0,2 + vpshufb zmm0,zmm0,zmm29 + vmovdqu8 zmm6{k1}{z},[r11*1+r9] + vbroadcastf64x2 zmm15,ZMMWORD[rcx] + vpxorq zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[16+rcx] + vaesenc zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[32+rcx] + vaesenc zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[48+rcx] + vaesenc zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[64+rcx] + vaesenc zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[80+rcx] + vaesenc zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[96+rcx] + vaesenc zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[112+rcx] + vaesenc zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[128+rcx] + vaesenc zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[160+rcx] + vaesenclast zmm0,zmm0,zmm15 + vpxorq zmm0,zmm0,zmm6 + vextracti32x4 xmm12,zmm0,2 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10]{k1},zmm0 + vmovdqu8 zmm0{k1}{z},zmm0 + vpshufb zmm6,zmm0,zmm29 + vextracti32x4 xmm13,zmm6,2 + sub r13,16 * (3 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_158 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 ymm20,YMMWORD[304+rdx] + vinserti64x2 zmm20,zmm20,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm6,zmm20,0x01 + vpclmulqdq zmm5,zmm6,zmm20,0x10 + vpclmulqdq zmm0,zmm6,zmm20,0x11 + vpclmulqdq zmm3,zmm6,zmm20,0x00 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_158 
+$L$_small_initial_partial_block_158: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm12 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 ymm20,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm6,ymm20,0x01 + vpclmulqdq ymm5,ymm6,ymm20,0x10 + vpclmulqdq ymm0,ymm6,ymm20,0x11 + vpclmulqdq ymm3,ymm6,ymm20,0x00 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_158: + + or r13,r13 + je NEAR $L$_after_reduction_158 + vpxorq xmm14,xmm14,xmm13 +$L$_after_reduction_158: + jmp NEAR $L$_small_initial_blocks_encrypted_155 +$L$_small_initial_num_blocks_is_4_155: + vmovdqa64 zmm29,ZMMWORD[SHUF_MASK] + vshufi64x2 zmm2,zmm2,zmm2,0 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + lea r10,[byte64_len_to_mask_table] + mov r15,r13 + kmovq k1,[r15*8+r10] + vextracti32x4 xmm2,zmm0,3 + vpshufb zmm0,zmm0,zmm29 + vmovdqu8 zmm6{k1}{z},[r11*1+r9] + vbroadcastf64x2 zmm15,ZMMWORD[rcx] + vpxorq zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[16+rcx] + vaesenc zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[32+rcx] + vaesenc zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[48+rcx] + vaesenc zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[64+rcx] + vaesenc zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[80+rcx] + vaesenc zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[96+rcx] + vaesenc zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[112+rcx] + vaesenc zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[128+rcx] + vaesenc zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[160+rcx] + vaesenclast zmm0,zmm0,zmm15 + vpxorq zmm0,zmm0,zmm6 + vextracti32x4 xmm12,zmm0,3 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10]{k1},zmm0 + vmovdqu8 zmm0{k1}{z},zmm0 + vpshufb zmm6,zmm0,zmm29 + vextracti32x4 xmm13,zmm6,3 + sub r13,16 * (4 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_159 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[288+rdx] + vpclmulqdq zmm15,zmm6,zmm20,0x11 + vpclmulqdq zmm16,zmm6,zmm20,0x00 + vpclmulqdq zmm17,zmm6,zmm20,0x01 + vpclmulqdq zmm19,zmm6,zmm20,0x10 + + vpxorq zmm17,zmm17,zmm19 + vpsrldq zmm4,zmm17,8 + vpslldq zmm5,zmm17,8 + vpxorq zmm0,zmm15,zmm4 + vpxorq zmm3,zmm16,zmm5 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_159 +$L$_small_initial_partial_block_159: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm12 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 ymm20,YMMWORD[304+rdx] + vinserti64x2 zmm20,zmm20,ZMMWORD[336+rdx],2 + 
vpclmulqdq zmm4,zmm6,zmm20,0x01 + vpclmulqdq zmm5,zmm6,zmm20,0x10 + vpclmulqdq zmm0,zmm6,zmm20,0x11 + vpclmulqdq zmm3,zmm6,zmm20,0x00 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_159: + + or r13,r13 + je NEAR $L$_after_reduction_159 + vpxorq xmm14,xmm14,xmm13 +$L$_after_reduction_159: + jmp NEAR $L$_small_initial_blocks_encrypted_155 +$L$_small_initial_num_blocks_is_5_155: + vmovdqa64 zmm29,ZMMWORD[SHUF_MASK] + vshufi64x2 zmm2,zmm2,zmm2,0 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpaddd zmm3,zmm2,ZMMWORD[ddq_add_5678] + lea r10,[byte64_len_to_mask_table] + mov r15,r13 + sub r15,64 + kmovq k1,[r15*8+r10] + vextracti32x4 xmm2,zmm3,0 + vpshufb zmm0,zmm0,zmm29 + vpshufb xmm3,xmm3,xmm29 + vmovdqu8 zmm6,ZMMWORD[r11*1+r9] + vmovdqu8 xmm7{k1}{z},[64+r11*1+r9] + vbroadcastf64x2 zmm15,ZMMWORD[rcx] + vpxorq zmm0,zmm0,zmm15 + vpxorq xmm3,xmm3,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[16+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc xmm3,xmm3,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[32+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc xmm3,xmm3,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[48+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc xmm3,xmm3,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[64+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc xmm3,xmm3,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[80+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc xmm3,xmm3,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[96+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc xmm3,xmm3,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[112+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc xmm3,xmm3,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[128+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc xmm3,xmm3,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc xmm3,xmm3,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[160+rcx] + vaesenclast zmm0,zmm0,zmm15 + vaesenclast xmm3,xmm3,xmm15 + vpxorq zmm0,zmm0,zmm6 + vpxorq xmm3,xmm3,xmm7 + vextracti32x4 xmm12,zmm3,0 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 XMMWORD[64+r11*1+r10]{k1},xmm3 + vmovdqu8 zmm3{k1}{z},zmm3 + vpshufb zmm6,zmm0,zmm29 + vpshufb xmm7,xmm3,xmm29 + vextracti32x4 xmm13,zmm7,0 + sub r13,16 * (5 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_160 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[272+rdx] + vpclmulqdq zmm15,zmm6,zmm20,0x11 + vpclmulqdq zmm16,zmm6,zmm20,0x00 + vpclmulqdq zmm17,zmm6,zmm20,0x01 + vpclmulqdq zmm19,zmm6,zmm20,0x10 + vmovdqu64 xmm20,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm7,xmm20,0x01 + vpclmulqdq xmm5,xmm7,xmm20,0x10 + vpclmulqdq xmm0,xmm7,xmm20,0x11 + vpclmulqdq xmm3,xmm7,xmm20,0x00 + + vpxorq zmm4,zmm4,zmm17 + vpxorq zmm5,zmm5,zmm19 + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm16 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + 
vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_160 +$L$_small_initial_partial_block_160: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm12 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[288+rdx] + vpclmulqdq zmm15,zmm6,zmm20,0x11 + vpclmulqdq zmm16,zmm6,zmm20,0x00 + vpclmulqdq zmm17,zmm6,zmm20,0x01 + vpclmulqdq zmm19,zmm6,zmm20,0x10 + + vpxorq zmm17,zmm17,zmm19 + vpsrldq zmm4,zmm17,8 + vpslldq zmm5,zmm17,8 + vpxorq zmm0,zmm15,zmm4 + vpxorq zmm3,zmm16,zmm5 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_160: + + or r13,r13 + je NEAR $L$_after_reduction_160 + vpxorq xmm14,xmm14,xmm13 +$L$_after_reduction_160: + jmp NEAR $L$_small_initial_blocks_encrypted_155 +$L$_small_initial_num_blocks_is_6_155: + vmovdqa64 zmm29,ZMMWORD[SHUF_MASK] + vshufi64x2 zmm2,zmm2,zmm2,0 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpaddd zmm3,zmm2,ZMMWORD[ddq_add_5678] + lea r10,[byte64_len_to_mask_table] + mov r15,r13 + sub r15,64 + kmovq k1,[r15*8+r10] + vextracti32x4 xmm2,zmm3,1 + vpshufb zmm0,zmm0,zmm29 + vpshufb ymm3,ymm3,ymm29 + vmovdqu8 zmm6,ZMMWORD[r11*1+r9] + vmovdqu8 ymm7{k1}{z},[64+r11*1+r9] + vbroadcastf64x2 zmm15,ZMMWORD[rcx] + vpxorq zmm0,zmm0,zmm15 + vpxorq ymm3,ymm3,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[16+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc ymm3,ymm3,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[32+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc ymm3,ymm3,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[48+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc ymm3,ymm3,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[64+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc ymm3,ymm3,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[80+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc ymm3,ymm3,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[96+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc ymm3,ymm3,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[112+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc ymm3,ymm3,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[128+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc ymm3,ymm3,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc ymm3,ymm3,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[160+rcx] + vaesenclast zmm0,zmm0,zmm15 + vaesenclast ymm3,ymm3,ymm15 + vpxorq zmm0,zmm0,zmm6 + vpxorq ymm3,ymm3,ymm7 + vextracti32x4 xmm12,zmm3,1 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 YMMWORD[64+r11*1+r10]{k1},ymm3 + vmovdqu8 zmm3{k1}{z},zmm3 + vpshufb zmm6,zmm0,zmm29 + vpshufb ymm7,ymm3,ymm29 + vextracti32x4 xmm13,zmm7,1 + sub r13,16 * (6 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_161 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[256+rdx] + vpclmulqdq zmm15,zmm6,zmm20,0x11 + vpclmulqdq zmm16,zmm6,zmm20,0x00 + vpclmulqdq zmm17,zmm6,zmm20,0x01 + vpclmulqdq zmm19,zmm6,zmm20,0x10 + vmovdqu64 
ymm20,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm7,ymm20,0x01 + vpclmulqdq ymm5,ymm7,ymm20,0x10 + vpclmulqdq ymm0,ymm7,ymm20,0x11 + vpclmulqdq ymm3,ymm7,ymm20,0x00 + + vpxorq zmm4,zmm4,zmm17 + vpxorq zmm5,zmm5,zmm19 + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm16 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_161 +$L$_small_initial_partial_block_161: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm12 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[272+rdx] + vpclmulqdq zmm15,zmm6,zmm20,0x11 + vpclmulqdq zmm16,zmm6,zmm20,0x00 + vpclmulqdq zmm17,zmm6,zmm20,0x01 + vpclmulqdq zmm19,zmm6,zmm20,0x10 + vmovdqu64 xmm20,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm7,xmm20,0x01 + vpclmulqdq xmm5,xmm7,xmm20,0x10 + vpclmulqdq xmm0,xmm7,xmm20,0x11 + vpclmulqdq xmm3,xmm7,xmm20,0x00 + + vpxorq zmm4,zmm4,zmm17 + vpxorq zmm5,zmm5,zmm19 + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm16 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_161: + + or r13,r13 + je NEAR $L$_after_reduction_161 + vpxorq xmm14,xmm14,xmm13 +$L$_after_reduction_161: + jmp NEAR $L$_small_initial_blocks_encrypted_155 +$L$_small_initial_num_blocks_is_7_155: + vmovdqa64 zmm29,ZMMWORD[SHUF_MASK] + vshufi64x2 zmm2,zmm2,zmm2,0 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpaddd zmm3,zmm2,ZMMWORD[ddq_add_5678] + lea r10,[byte64_len_to_mask_table] + mov r15,r13 + sub r15,64 + kmovq k1,[r15*8+r10] + vextracti32x4 xmm2,zmm3,2 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vmovdqu8 zmm6,ZMMWORD[r11*1+r9] + vmovdqu8 zmm7{k1}{z},[64+r11*1+r9] + vbroadcastf64x2 zmm15,ZMMWORD[rcx] + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[16+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[32+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[48+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[64+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[80+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[96+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[112+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[128+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + 
vbroadcastf64x2 zmm15,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[160+rcx] + vaesenclast zmm0,zmm0,zmm15 + vaesenclast zmm3,zmm3,zmm15 + vpxorq zmm0,zmm0,zmm6 + vpxorq zmm3,zmm3,zmm7 + vextracti32x4 xmm12,zmm3,2 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10]{k1},zmm3 + vmovdqu8 zmm3{k1}{z},zmm3 + vpshufb zmm6,zmm0,zmm29 + vpshufb zmm7,zmm3,zmm29 + vextracti32x4 xmm13,zmm7,2 + sub r13,16 * (7 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_162 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[240+rdx] + vpclmulqdq zmm15,zmm6,zmm20,0x11 + vpclmulqdq zmm16,zmm6,zmm20,0x00 + vpclmulqdq zmm17,zmm6,zmm20,0x01 + vpclmulqdq zmm19,zmm6,zmm20,0x10 + vmovdqu64 ymm20,YMMWORD[304+rdx] + vinserti64x2 zmm20,zmm20,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm7,zmm20,0x01 + vpclmulqdq zmm5,zmm7,zmm20,0x10 + vpclmulqdq zmm0,zmm7,zmm20,0x11 + vpclmulqdq zmm3,zmm7,zmm20,0x00 + + vpxorq zmm4,zmm4,zmm17 + vpxorq zmm5,zmm5,zmm19 + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm16 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_162 +$L$_small_initial_partial_block_162: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm12 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[256+rdx] + vpclmulqdq zmm15,zmm6,zmm20,0x11 + vpclmulqdq zmm16,zmm6,zmm20,0x00 + vpclmulqdq zmm17,zmm6,zmm20,0x01 + vpclmulqdq zmm19,zmm6,zmm20,0x10 + vmovdqu64 ymm20,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm7,ymm20,0x01 + vpclmulqdq ymm5,ymm7,ymm20,0x10 + vpclmulqdq ymm0,ymm7,ymm20,0x11 + vpclmulqdq ymm3,ymm7,ymm20,0x00 + + vpxorq zmm4,zmm4,zmm17 + vpxorq zmm5,zmm5,zmm19 + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm16 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_162: + + or r13,r13 + je NEAR $L$_after_reduction_162 + vpxorq xmm14,xmm14,xmm13 +$L$_after_reduction_162: + jmp NEAR $L$_small_initial_blocks_encrypted_155 +$L$_small_initial_num_blocks_is_8_155: + vmovdqa64 zmm29,ZMMWORD[SHUF_MASK] + vshufi64x2 zmm2,zmm2,zmm2,0 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpaddd zmm3,zmm2,ZMMWORD[ddq_add_5678] + lea r10,[byte64_len_to_mask_table] + mov r15,r13 + sub r15,64 + kmovq k1,[r15*8+r10] + vextracti32x4 xmm2,zmm3,3 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vmovdqu8 zmm6,ZMMWORD[r11*1+r9] + 
vmovdqu8 zmm7{k1}{z},[64+r11*1+r9] + vbroadcastf64x2 zmm15,ZMMWORD[rcx] + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[16+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[32+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[48+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[64+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[80+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[96+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[112+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[128+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[160+rcx] + vaesenclast zmm0,zmm0,zmm15 + vaesenclast zmm3,zmm3,zmm15 + vpxorq zmm0,zmm0,zmm6 + vpxorq zmm3,zmm3,zmm7 + vextracti32x4 xmm12,zmm3,3 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10]{k1},zmm3 + vmovdqu8 zmm3{k1}{z},zmm3 + vpshufb zmm6,zmm0,zmm29 + vpshufb zmm7,zmm3,zmm29 + vextracti32x4 xmm13,zmm7,3 + sub r13,16 * (8 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_163 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[224+rdx] + vpclmulqdq zmm0,zmm6,zmm20,0x11 + vpclmulqdq zmm3,zmm6,zmm20,0x00 + vpclmulqdq zmm4,zmm6,zmm20,0x01 + vpclmulqdq zmm5,zmm6,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[288+rdx] + vpclmulqdq zmm15,zmm7,zmm20,0x11 + vpclmulqdq zmm16,zmm7,zmm20,0x00 + vpclmulqdq zmm17,zmm7,zmm20,0x01 + vpclmulqdq zmm19,zmm7,zmm20,0x10 + vpxorq zmm15,zmm0,zmm15 + vpxorq zmm16,zmm3,zmm16 + vpxorq zmm17,zmm4,zmm17 + vpxorq zmm19,zmm5,zmm19 + + vpxorq zmm17,zmm17,zmm19 + vpsrldq zmm4,zmm17,8 + vpslldq zmm5,zmm17,8 + vpxorq zmm0,zmm15,zmm4 + vpxorq zmm3,zmm16,zmm5 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_163 +$L$_small_initial_partial_block_163: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm12 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[240+rdx] + vpclmulqdq zmm15,zmm6,zmm20,0x11 + vpclmulqdq zmm16,zmm6,zmm20,0x00 + vpclmulqdq zmm17,zmm6,zmm20,0x01 + vpclmulqdq zmm19,zmm6,zmm20,0x10 + vmovdqu64 ymm20,YMMWORD[304+rdx] + vinserti64x2 zmm20,zmm20,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm7,zmm20,0x01 + vpclmulqdq zmm5,zmm7,zmm20,0x10 + vpclmulqdq zmm0,zmm7,zmm20,0x11 + vpclmulqdq zmm3,zmm7,zmm20,0x00 + + vpxorq zmm4,zmm4,zmm17 + vpxorq zmm5,zmm5,zmm19 + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm16 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq 
xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_163: + + or r13,r13 + je NEAR $L$_after_reduction_163 + vpxorq xmm14,xmm14,xmm13 +$L$_after_reduction_163: + jmp NEAR $L$_small_initial_blocks_encrypted_155 +$L$_small_initial_num_blocks_is_9_155: + vmovdqa64 zmm29,ZMMWORD[SHUF_MASK] + vshufi64x2 zmm2,zmm2,zmm2,0 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpaddd zmm3,zmm2,ZMMWORD[ddq_add_5678] + vpaddd zmm4,zmm0,ZMMWORD[ddq_add_8888] + lea r10,[byte64_len_to_mask_table] + mov r15,r13 + sub r15,128 + kmovq k1,[r15*8+r10] + vextracti32x4 xmm2,zmm4,0 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb xmm4,xmm4,xmm29 + vmovdqu8 zmm6,ZMMWORD[r11*1+r9] + vmovdqu8 zmm7,ZMMWORD[64+r11*1+r9] + vmovdqu8 xmm10{k1}{z},[128+r11*1+r9] + vbroadcastf64x2 zmm15,ZMMWORD[rcx] + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm15 + vpxorq xmm4,xmm4,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[16+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc xmm4,xmm4,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[32+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc xmm4,xmm4,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[48+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc xmm4,xmm4,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[64+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc xmm4,xmm4,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[80+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc xmm4,xmm4,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[96+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc xmm4,xmm4,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[112+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc xmm4,xmm4,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[128+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc xmm4,xmm4,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc xmm4,xmm4,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[160+rcx] + vaesenclast zmm0,zmm0,zmm15 + vaesenclast zmm3,zmm3,zmm15 + vaesenclast xmm4,xmm4,xmm15 + vpxorq zmm0,zmm0,zmm6 + vpxorq zmm3,zmm3,zmm7 + vpxorq xmm4,xmm4,xmm10 + vextracti32x4 xmm12,zmm4,0 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 XMMWORD[128+r11*1+r10]{k1},xmm4 + vmovdqu8 zmm4{k1}{z},zmm4 + vpshufb zmm6,zmm0,zmm29 + vpshufb zmm7,zmm3,zmm29 + vpshufb xmm10,xmm4,xmm29 + vextracti32x4 xmm13,zmm10,0 + sub r13,16 * (9 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_164 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[208+rdx] + vpclmulqdq zmm0,zmm6,zmm20,0x11 + vpclmulqdq zmm3,zmm6,zmm20,0x00 + vpclmulqdq zmm4,zmm6,zmm20,0x01 + vpclmulqdq zmm5,zmm6,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[272+rdx] + vpclmulqdq zmm15,zmm7,zmm20,0x11 + vpclmulqdq zmm16,zmm7,zmm20,0x00 + vpclmulqdq zmm17,zmm7,zmm20,0x01 + vpclmulqdq zmm19,zmm7,zmm20,0x10 + vpxorq zmm15,zmm0,zmm15 + vpxorq zmm16,zmm3,zmm16 + vpxorq zmm17,zmm4,zmm17 + vpxorq zmm19,zmm5,zmm19 + vmovdqu64 xmm20,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm10,xmm20,0x01 + vpclmulqdq xmm5,xmm10,xmm20,0x10 + vpclmulqdq xmm0,xmm10,xmm20,0x11 + vpclmulqdq xmm3,xmm10,xmm20,0x00 + + vpxorq zmm4,zmm4,zmm17 + vpxorq zmm5,zmm5,zmm19 + vpxorq zmm0,zmm0,zmm15 + 
vpxorq zmm3,zmm3,zmm16 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_164 +$L$_small_initial_partial_block_164: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm12 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[224+rdx] + vpclmulqdq zmm0,zmm6,zmm20,0x11 + vpclmulqdq zmm3,zmm6,zmm20,0x00 + vpclmulqdq zmm4,zmm6,zmm20,0x01 + vpclmulqdq zmm5,zmm6,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[288+rdx] + vpclmulqdq zmm15,zmm7,zmm20,0x11 + vpclmulqdq zmm16,zmm7,zmm20,0x00 + vpclmulqdq zmm17,zmm7,zmm20,0x01 + vpclmulqdq zmm19,zmm7,zmm20,0x10 + vpxorq zmm15,zmm0,zmm15 + vpxorq zmm16,zmm3,zmm16 + vpxorq zmm17,zmm4,zmm17 + vpxorq zmm19,zmm5,zmm19 + + vpxorq zmm17,zmm17,zmm19 + vpsrldq zmm4,zmm17,8 + vpslldq zmm5,zmm17,8 + vpxorq zmm0,zmm15,zmm4 + vpxorq zmm3,zmm16,zmm5 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_164: + + or r13,r13 + je NEAR $L$_after_reduction_164 + vpxorq xmm14,xmm14,xmm13 +$L$_after_reduction_164: + jmp NEAR $L$_small_initial_blocks_encrypted_155 +$L$_small_initial_num_blocks_is_10_155: + vmovdqa64 zmm29,ZMMWORD[SHUF_MASK] + vshufi64x2 zmm2,zmm2,zmm2,0 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpaddd zmm3,zmm2,ZMMWORD[ddq_add_5678] + vpaddd zmm4,zmm0,ZMMWORD[ddq_add_8888] + lea r10,[byte64_len_to_mask_table] + mov r15,r13 + sub r15,128 + kmovq k1,[r15*8+r10] + vextracti32x4 xmm2,zmm4,1 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb ymm4,ymm4,ymm29 + vmovdqu8 zmm6,ZMMWORD[r11*1+r9] + vmovdqu8 zmm7,ZMMWORD[64+r11*1+r9] + vmovdqu8 ymm10{k1}{z},[128+r11*1+r9] + vbroadcastf64x2 zmm15,ZMMWORD[rcx] + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm15 + vpxorq ymm4,ymm4,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[16+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc ymm4,ymm4,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[32+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc ymm4,ymm4,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[48+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc ymm4,ymm4,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[64+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc ymm4,ymm4,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[80+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc ymm4,ymm4,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[96+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc ymm4,ymm4,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[112+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc ymm4,ymm4,ymm15 + 
vbroadcastf64x2 zmm15,ZMMWORD[128+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc ymm4,ymm4,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc ymm4,ymm4,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[160+rcx] + vaesenclast zmm0,zmm0,zmm15 + vaesenclast zmm3,zmm3,zmm15 + vaesenclast ymm4,ymm4,ymm15 + vpxorq zmm0,zmm0,zmm6 + vpxorq zmm3,zmm3,zmm7 + vpxorq ymm4,ymm4,ymm10 + vextracti32x4 xmm12,zmm4,1 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 YMMWORD[128+r11*1+r10]{k1},ymm4 + vmovdqu8 zmm4{k1}{z},zmm4 + vpshufb zmm6,zmm0,zmm29 + vpshufb zmm7,zmm3,zmm29 + vpshufb ymm10,ymm4,ymm29 + vextracti32x4 xmm13,zmm10,1 + sub r13,16 * (10 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_165 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[192+rdx] + vpclmulqdq zmm0,zmm6,zmm20,0x11 + vpclmulqdq zmm3,zmm6,zmm20,0x00 + vpclmulqdq zmm4,zmm6,zmm20,0x01 + vpclmulqdq zmm5,zmm6,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[256+rdx] + vpclmulqdq zmm15,zmm7,zmm20,0x11 + vpclmulqdq zmm16,zmm7,zmm20,0x00 + vpclmulqdq zmm17,zmm7,zmm20,0x01 + vpclmulqdq zmm19,zmm7,zmm20,0x10 + vpxorq zmm15,zmm0,zmm15 + vpxorq zmm16,zmm3,zmm16 + vpxorq zmm17,zmm4,zmm17 + vpxorq zmm19,zmm5,zmm19 + vmovdqu64 ymm20,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm10,ymm20,0x01 + vpclmulqdq ymm5,ymm10,ymm20,0x10 + vpclmulqdq ymm0,ymm10,ymm20,0x11 + vpclmulqdq ymm3,ymm10,ymm20,0x00 + + vpxorq zmm4,zmm4,zmm17 + vpxorq zmm5,zmm5,zmm19 + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm16 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_165 +$L$_small_initial_partial_block_165: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm12 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[208+rdx] + vpclmulqdq zmm0,zmm6,zmm20,0x11 + vpclmulqdq zmm3,zmm6,zmm20,0x00 + vpclmulqdq zmm4,zmm6,zmm20,0x01 + vpclmulqdq zmm5,zmm6,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[272+rdx] + vpclmulqdq zmm15,zmm7,zmm20,0x11 + vpclmulqdq zmm16,zmm7,zmm20,0x00 + vpclmulqdq zmm17,zmm7,zmm20,0x01 + vpclmulqdq zmm19,zmm7,zmm20,0x10 + vpxorq zmm15,zmm0,zmm15 + vpxorq zmm16,zmm3,zmm16 + vpxorq zmm17,zmm4,zmm17 + vpxorq zmm19,zmm5,zmm19 + vmovdqu64 xmm20,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm10,xmm20,0x01 + vpclmulqdq xmm5,xmm10,xmm20,0x10 + vpclmulqdq xmm0,xmm10,xmm20,0x11 + vpclmulqdq xmm3,xmm10,xmm20,0x00 + + vpxorq zmm4,zmm4,zmm17 + vpxorq zmm5,zmm5,zmm19 + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm16 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + 
vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_165: + + or r13,r13 + je NEAR $L$_after_reduction_165 + vpxorq xmm14,xmm14,xmm13 +$L$_after_reduction_165: + jmp NEAR $L$_small_initial_blocks_encrypted_155 +$L$_small_initial_num_blocks_is_11_155: + vmovdqa64 zmm29,ZMMWORD[SHUF_MASK] + vshufi64x2 zmm2,zmm2,zmm2,0 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpaddd zmm3,zmm2,ZMMWORD[ddq_add_5678] + vpaddd zmm4,zmm0,ZMMWORD[ddq_add_8888] + lea r10,[byte64_len_to_mask_table] + mov r15,r13 + sub r15,128 + kmovq k1,[r15*8+r10] + vextracti32x4 xmm2,zmm4,2 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vmovdqu8 zmm6,ZMMWORD[r11*1+r9] + vmovdqu8 zmm7,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm10{k1}{z},[128+r11*1+r9] + vbroadcastf64x2 zmm15,ZMMWORD[rcx] + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm15 + vpxorq zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[16+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[32+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[48+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[64+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[80+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[96+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[112+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[128+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[160+rcx] + vaesenclast zmm0,zmm0,zmm15 + vaesenclast zmm3,zmm3,zmm15 + vaesenclast zmm4,zmm4,zmm15 + vpxorq zmm0,zmm0,zmm6 + vpxorq zmm3,zmm3,zmm7 + vpxorq zmm4,zmm4,zmm10 + vextracti32x4 xmm12,zmm4,2 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10]{k1},zmm4 + vmovdqu8 zmm4{k1}{z},zmm4 + vpshufb zmm6,zmm0,zmm29 + vpshufb zmm7,zmm3,zmm29 + vpshufb zmm10,zmm4,zmm29 + vextracti32x4 xmm13,zmm10,2 + sub r13,16 * (11 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_166 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[176+rdx] + vpclmulqdq zmm0,zmm6,zmm20,0x11 + vpclmulqdq zmm3,zmm6,zmm20,0x00 + vpclmulqdq zmm4,zmm6,zmm20,0x01 + vpclmulqdq zmm5,zmm6,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[240+rdx] + vpclmulqdq zmm15,zmm7,zmm20,0x11 + vpclmulqdq zmm16,zmm7,zmm20,0x00 + vpclmulqdq zmm17,zmm7,zmm20,0x01 + vpclmulqdq zmm19,zmm7,zmm20,0x10 + vpxorq zmm15,zmm0,zmm15 + vpxorq zmm16,zmm3,zmm16 + vpxorq zmm17,zmm4,zmm17 + vpxorq zmm19,zmm5,zmm19 + vmovdqu64 ymm20,YMMWORD[304+rdx] + vinserti64x2 zmm20,zmm20,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm10,zmm20,0x01 + vpclmulqdq zmm5,zmm10,zmm20,0x10 + vpclmulqdq zmm0,zmm10,zmm20,0x11 + vpclmulqdq zmm3,zmm10,zmm20,0x00 + + vpxorq zmm4,zmm4,zmm17 + vpxorq zmm5,zmm5,zmm19 + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm16 + + vpxorq 
zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_166 +$L$_small_initial_partial_block_166: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm12 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[192+rdx] + vpclmulqdq zmm0,zmm6,zmm20,0x11 + vpclmulqdq zmm3,zmm6,zmm20,0x00 + vpclmulqdq zmm4,zmm6,zmm20,0x01 + vpclmulqdq zmm5,zmm6,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[256+rdx] + vpclmulqdq zmm15,zmm7,zmm20,0x11 + vpclmulqdq zmm16,zmm7,zmm20,0x00 + vpclmulqdq zmm17,zmm7,zmm20,0x01 + vpclmulqdq zmm19,zmm7,zmm20,0x10 + vpxorq zmm15,zmm0,zmm15 + vpxorq zmm16,zmm3,zmm16 + vpxorq zmm17,zmm4,zmm17 + vpxorq zmm19,zmm5,zmm19 + vmovdqu64 ymm20,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm10,ymm20,0x01 + vpclmulqdq ymm5,ymm10,ymm20,0x10 + vpclmulqdq ymm0,ymm10,ymm20,0x11 + vpclmulqdq ymm3,ymm10,ymm20,0x00 + + vpxorq zmm4,zmm4,zmm17 + vpxorq zmm5,zmm5,zmm19 + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm16 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_166: + + or r13,r13 + je NEAR $L$_after_reduction_166 + vpxorq xmm14,xmm14,xmm13 +$L$_after_reduction_166: + jmp NEAR $L$_small_initial_blocks_encrypted_155 +$L$_small_initial_num_blocks_is_12_155: + vmovdqa64 zmm29,ZMMWORD[SHUF_MASK] + vshufi64x2 zmm2,zmm2,zmm2,0 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpaddd zmm3,zmm2,ZMMWORD[ddq_add_5678] + vpaddd zmm4,zmm0,ZMMWORD[ddq_add_8888] + lea r10,[byte64_len_to_mask_table] + mov r15,r13 + sub r15,128 + kmovq k1,[r15*8+r10] + vextracti32x4 xmm2,zmm4,3 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vmovdqu8 zmm6,ZMMWORD[r11*1+r9] + vmovdqu8 zmm7,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm10{k1}{z},[128+r11*1+r9] + vbroadcastf64x2 zmm15,ZMMWORD[rcx] + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm15 + vpxorq zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[16+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[32+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[48+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[64+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[80+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + 
vbroadcastf64x2 zmm15,ZMMWORD[96+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[112+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[128+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[160+rcx] + vaesenclast zmm0,zmm0,zmm15 + vaesenclast zmm3,zmm3,zmm15 + vaesenclast zmm4,zmm4,zmm15 + vpxorq zmm0,zmm0,zmm6 + vpxorq zmm3,zmm3,zmm7 + vpxorq zmm4,zmm4,zmm10 + vextracti32x4 xmm12,zmm4,3 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10]{k1},zmm4 + vmovdqu8 zmm4{k1}{z},zmm4 + vpshufb zmm6,zmm0,zmm29 + vpshufb zmm7,zmm3,zmm29 + vpshufb zmm10,zmm4,zmm29 + vextracti32x4 xmm13,zmm10,3 + sub r13,16 * (12 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_167 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[160+rdx] + vpclmulqdq zmm0,zmm6,zmm20,0x11 + vpclmulqdq zmm3,zmm6,zmm20,0x00 + vpclmulqdq zmm4,zmm6,zmm20,0x01 + vpclmulqdq zmm5,zmm6,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[224+rdx] + vpclmulqdq zmm15,zmm7,zmm20,0x11 + vpclmulqdq zmm16,zmm7,zmm20,0x00 + vpclmulqdq zmm17,zmm7,zmm20,0x01 + vpclmulqdq zmm19,zmm7,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[288+rdx] + vpclmulqdq zmm6,zmm10,zmm20,0x11 + vpclmulqdq zmm7,zmm10,zmm20,0x00 + vpternlogq zmm15,zmm6,zmm0,0x96 + vpternlogq zmm16,zmm7,zmm3,0x96 + vpclmulqdq zmm6,zmm10,zmm20,0x01 + vpclmulqdq zmm7,zmm10,zmm20,0x10 + vpternlogq zmm17,zmm6,zmm4,0x96 + vpternlogq zmm19,zmm7,zmm5,0x96 + + vpxorq zmm17,zmm17,zmm19 + vpsrldq zmm4,zmm17,8 + vpslldq zmm5,zmm17,8 + vpxorq zmm0,zmm15,zmm4 + vpxorq zmm3,zmm16,zmm5 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_167 +$L$_small_initial_partial_block_167: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm12 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[176+rdx] + vpclmulqdq zmm0,zmm6,zmm20,0x11 + vpclmulqdq zmm3,zmm6,zmm20,0x00 + vpclmulqdq zmm4,zmm6,zmm20,0x01 + vpclmulqdq zmm5,zmm6,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[240+rdx] + vpclmulqdq zmm15,zmm7,zmm20,0x11 + vpclmulqdq zmm16,zmm7,zmm20,0x00 + vpclmulqdq zmm17,zmm7,zmm20,0x01 + vpclmulqdq zmm19,zmm7,zmm20,0x10 + vpxorq zmm15,zmm0,zmm15 + vpxorq zmm16,zmm3,zmm16 + vpxorq zmm17,zmm4,zmm17 + vpxorq zmm19,zmm5,zmm19 + vmovdqu64 ymm20,YMMWORD[304+rdx] + vinserti64x2 zmm20,zmm20,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm10,zmm20,0x01 + vpclmulqdq zmm5,zmm10,zmm20,0x10 + vpclmulqdq zmm0,zmm10,zmm20,0x11 + vpclmulqdq zmm3,zmm10,zmm20,0x00 + + vpxorq zmm4,zmm4,zmm17 + vpxorq zmm5,zmm5,zmm19 + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm16 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 
xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_167: + + or r13,r13 + je NEAR $L$_after_reduction_167 + vpxorq xmm14,xmm14,xmm13 +$L$_after_reduction_167: + jmp NEAR $L$_small_initial_blocks_encrypted_155 +$L$_small_initial_num_blocks_is_13_155: + vmovdqa64 zmm29,ZMMWORD[SHUF_MASK] + vshufi64x2 zmm2,zmm2,zmm2,0 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpaddd zmm3,zmm2,ZMMWORD[ddq_add_5678] + vpaddd zmm4,zmm0,ZMMWORD[ddq_add_8888] + vpaddd zmm5,zmm3,ZMMWORD[ddq_add_8888] + lea r10,[byte64_len_to_mask_table] + mov r15,r13 + sub r15,192 + kmovq k1,[r15*8+r10] + vextracti32x4 xmm2,zmm5,0 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb xmm5,xmm5,xmm29 + vmovdqu8 zmm6,ZMMWORD[r11*1+r9] + vmovdqu8 zmm7,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm10,ZMMWORD[128+r11*1+r9] + vmovdqu8 xmm11{k1}{z},[192+r11*1+r9] + vbroadcastf64x2 zmm15,ZMMWORD[rcx] + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm15 + vpxorq zmm4,zmm4,zmm15 + vpxorq xmm5,xmm5,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[16+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc xmm5,xmm5,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[32+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc xmm5,xmm5,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[48+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc xmm5,xmm5,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[64+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc xmm5,xmm5,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[80+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc xmm5,xmm5,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[96+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc xmm5,xmm5,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[112+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc xmm5,xmm5,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[128+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc xmm5,xmm5,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc xmm5,xmm5,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[160+rcx] + vaesenclast zmm0,zmm0,zmm15 + vaesenclast zmm3,zmm3,zmm15 + vaesenclast zmm4,zmm4,zmm15 + vaesenclast xmm5,xmm5,xmm15 + vpxorq zmm0,zmm0,zmm6 + vpxorq zmm3,zmm3,zmm7 + vpxorq zmm4,zmm4,zmm10 + vpxorq xmm5,xmm5,xmm11 + vextracti32x4 xmm12,zmm5,0 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm4 + vmovdqu8 XMMWORD[192+r11*1+r10]{k1},xmm5 + vmovdqu8 zmm5{k1}{z},zmm5 + vpshufb zmm6,zmm0,zmm29 + vpshufb zmm7,zmm3,zmm29 + vpshufb zmm10,zmm4,zmm29 + vpshufb xmm11,xmm5,xmm29 + vextracti32x4 xmm13,zmm11,0 + sub r13,16 * (13 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_168 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[144+rdx] + vpclmulqdq zmm0,zmm6,zmm20,0x11 + vpclmulqdq 
zmm3,zmm6,zmm20,0x00 + vpclmulqdq zmm4,zmm6,zmm20,0x01 + vpclmulqdq zmm5,zmm6,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[208+rdx] + vpclmulqdq zmm15,zmm7,zmm20,0x11 + vpclmulqdq zmm16,zmm7,zmm20,0x00 + vpclmulqdq zmm17,zmm7,zmm20,0x01 + vpclmulqdq zmm19,zmm7,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[272+rdx] + vpclmulqdq zmm6,zmm10,zmm20,0x11 + vpclmulqdq zmm7,zmm10,zmm20,0x00 + vpternlogq zmm15,zmm6,zmm0,0x96 + vpternlogq zmm16,zmm7,zmm3,0x96 + vpclmulqdq zmm6,zmm10,zmm20,0x01 + vpclmulqdq zmm7,zmm10,zmm20,0x10 + vpternlogq zmm17,zmm6,zmm4,0x96 + vpternlogq zmm19,zmm7,zmm5,0x96 + vmovdqu64 xmm20,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm11,xmm20,0x01 + vpclmulqdq xmm5,xmm11,xmm20,0x10 + vpclmulqdq xmm0,xmm11,xmm20,0x11 + vpclmulqdq xmm3,xmm11,xmm20,0x00 + + vpxorq zmm4,zmm4,zmm17 + vpxorq zmm5,zmm5,zmm19 + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm16 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_168 +$L$_small_initial_partial_block_168: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm12 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[160+rdx] + vpclmulqdq zmm0,zmm6,zmm20,0x11 + vpclmulqdq zmm3,zmm6,zmm20,0x00 + vpclmulqdq zmm4,zmm6,zmm20,0x01 + vpclmulqdq zmm5,zmm6,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[224+rdx] + vpclmulqdq zmm15,zmm7,zmm20,0x11 + vpclmulqdq zmm16,zmm7,zmm20,0x00 + vpclmulqdq zmm17,zmm7,zmm20,0x01 + vpclmulqdq zmm19,zmm7,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[288+rdx] + vpclmulqdq zmm6,zmm10,zmm20,0x11 + vpclmulqdq zmm7,zmm10,zmm20,0x00 + vpternlogq zmm15,zmm6,zmm0,0x96 + vpternlogq zmm16,zmm7,zmm3,0x96 + vpclmulqdq zmm6,zmm10,zmm20,0x01 + vpclmulqdq zmm7,zmm10,zmm20,0x10 + vpternlogq zmm17,zmm6,zmm4,0x96 + vpternlogq zmm19,zmm7,zmm5,0x96 + + vpxorq zmm17,zmm17,zmm19 + vpsrldq zmm4,zmm17,8 + vpslldq zmm5,zmm17,8 + vpxorq zmm0,zmm15,zmm4 + vpxorq zmm3,zmm16,zmm5 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_168: + + or r13,r13 + je NEAR $L$_after_reduction_168 + vpxorq xmm14,xmm14,xmm13 +$L$_after_reduction_168: + jmp NEAR $L$_small_initial_blocks_encrypted_155 +$L$_small_initial_num_blocks_is_14_155: + vmovdqa64 zmm29,ZMMWORD[SHUF_MASK] + vshufi64x2 zmm2,zmm2,zmm2,0 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpaddd zmm3,zmm2,ZMMWORD[ddq_add_5678] + vpaddd zmm4,zmm0,ZMMWORD[ddq_add_8888] + vpaddd zmm5,zmm3,ZMMWORD[ddq_add_8888] + lea r10,[byte64_len_to_mask_table] + mov r15,r13 + sub r15,192 + kmovq k1,[r15*8+r10] + vextracti32x4 xmm2,zmm5,1 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb 
zmm4,zmm4,zmm29 + vpshufb ymm5,ymm5,ymm29 + vmovdqu8 zmm6,ZMMWORD[r11*1+r9] + vmovdqu8 zmm7,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm10,ZMMWORD[128+r11*1+r9] + vmovdqu8 ymm11{k1}{z},[192+r11*1+r9] + vbroadcastf64x2 zmm15,ZMMWORD[rcx] + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm15 + vpxorq zmm4,zmm4,zmm15 + vpxorq ymm5,ymm5,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[16+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc ymm5,ymm5,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[32+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc ymm5,ymm5,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[48+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc ymm5,ymm5,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[64+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc ymm5,ymm5,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[80+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc ymm5,ymm5,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[96+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc ymm5,ymm5,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[112+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc ymm5,ymm5,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[128+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc ymm5,ymm5,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc ymm5,ymm5,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[160+rcx] + vaesenclast zmm0,zmm0,zmm15 + vaesenclast zmm3,zmm3,zmm15 + vaesenclast zmm4,zmm4,zmm15 + vaesenclast ymm5,ymm5,ymm15 + vpxorq zmm0,zmm0,zmm6 + vpxorq zmm3,zmm3,zmm7 + vpxorq zmm4,zmm4,zmm10 + vpxorq ymm5,ymm5,ymm11 + vextracti32x4 xmm12,zmm5,1 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm4 + vmovdqu8 YMMWORD[192+r11*1+r10]{k1},ymm5 + vmovdqu8 zmm5{k1}{z},zmm5 + vpshufb zmm6,zmm0,zmm29 + vpshufb zmm7,zmm3,zmm29 + vpshufb zmm10,zmm4,zmm29 + vpshufb ymm11,ymm5,ymm29 + vextracti32x4 xmm13,zmm11,1 + sub r13,16 * (14 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_169 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[128+rdx] + vpclmulqdq zmm0,zmm6,zmm20,0x11 + vpclmulqdq zmm3,zmm6,zmm20,0x00 + vpclmulqdq zmm4,zmm6,zmm20,0x01 + vpclmulqdq zmm5,zmm6,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[192+rdx] + vpclmulqdq zmm15,zmm7,zmm20,0x11 + vpclmulqdq zmm16,zmm7,zmm20,0x00 + vpclmulqdq zmm17,zmm7,zmm20,0x01 + vpclmulqdq zmm19,zmm7,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[256+rdx] + vpclmulqdq zmm6,zmm10,zmm20,0x11 + vpclmulqdq zmm7,zmm10,zmm20,0x00 + vpternlogq zmm15,zmm6,zmm0,0x96 + vpternlogq zmm16,zmm7,zmm3,0x96 + vpclmulqdq zmm6,zmm10,zmm20,0x01 + vpclmulqdq zmm7,zmm10,zmm20,0x10 + vpternlogq zmm17,zmm6,zmm4,0x96 + vpternlogq zmm19,zmm7,zmm5,0x96 + vmovdqu64 ymm20,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm11,ymm20,0x01 + vpclmulqdq ymm5,ymm11,ymm20,0x10 + vpclmulqdq ymm0,ymm11,ymm20,0x11 + vpclmulqdq ymm3,ymm11,ymm20,0x00 + + vpxorq zmm4,zmm4,zmm17 + vpxorq zmm5,zmm5,zmm19 + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm16 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + 
vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_169 +$L$_small_initial_partial_block_169: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm12 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[144+rdx] + vpclmulqdq zmm0,zmm6,zmm20,0x11 + vpclmulqdq zmm3,zmm6,zmm20,0x00 + vpclmulqdq zmm4,zmm6,zmm20,0x01 + vpclmulqdq zmm5,zmm6,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[208+rdx] + vpclmulqdq zmm15,zmm7,zmm20,0x11 + vpclmulqdq zmm16,zmm7,zmm20,0x00 + vpclmulqdq zmm17,zmm7,zmm20,0x01 + vpclmulqdq zmm19,zmm7,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[272+rdx] + vpclmulqdq zmm6,zmm10,zmm20,0x11 + vpclmulqdq zmm7,zmm10,zmm20,0x00 + vpternlogq zmm15,zmm6,zmm0,0x96 + vpternlogq zmm16,zmm7,zmm3,0x96 + vpclmulqdq zmm6,zmm10,zmm20,0x01 + vpclmulqdq zmm7,zmm10,zmm20,0x10 + vpternlogq zmm17,zmm6,zmm4,0x96 + vpternlogq zmm19,zmm7,zmm5,0x96 + vmovdqu64 xmm20,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm11,xmm20,0x01 + vpclmulqdq xmm5,xmm11,xmm20,0x10 + vpclmulqdq xmm0,xmm11,xmm20,0x11 + vpclmulqdq xmm3,xmm11,xmm20,0x00 + + vpxorq zmm4,zmm4,zmm17 + vpxorq zmm5,zmm5,zmm19 + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm16 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_169: + + or r13,r13 + je NEAR $L$_after_reduction_169 + vpxorq xmm14,xmm14,xmm13 +$L$_after_reduction_169: + jmp NEAR $L$_small_initial_blocks_encrypted_155 +$L$_small_initial_num_blocks_is_15_155: + vmovdqa64 zmm29,ZMMWORD[SHUF_MASK] + vshufi64x2 zmm2,zmm2,zmm2,0 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpaddd zmm3,zmm2,ZMMWORD[ddq_add_5678] + vpaddd zmm4,zmm0,ZMMWORD[ddq_add_8888] + vpaddd zmm5,zmm3,ZMMWORD[ddq_add_8888] + lea r10,[byte64_len_to_mask_table] + mov r15,r13 + sub r15,192 + kmovq k1,[r15*8+r10] + vextracti32x4 xmm2,zmm5,2 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb zmm5,zmm5,zmm29 + vmovdqu8 zmm6,ZMMWORD[r11*1+r9] + vmovdqu8 zmm7,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm10,ZMMWORD[128+r11*1+r9] + vmovdqu8 zmm11{k1}{z},[192+r11*1+r9] + vbroadcastf64x2 zmm15,ZMMWORD[rcx] + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm15 + vpxorq zmm4,zmm4,zmm15 + vpxorq zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[16+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[32+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[48+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc zmm5,zmm5,zmm15 + vbroadcastf64x2 
zmm15,ZMMWORD[64+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[80+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[96+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[112+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[128+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[160+rcx] + vaesenclast zmm0,zmm0,zmm15 + vaesenclast zmm3,zmm3,zmm15 + vaesenclast zmm4,zmm4,zmm15 + vaesenclast zmm5,zmm5,zmm15 + vpxorq zmm0,zmm0,zmm6 + vpxorq zmm3,zmm3,zmm7 + vpxorq zmm4,zmm4,zmm10 + vpxorq zmm5,zmm5,zmm11 + vextracti32x4 xmm12,zmm5,2 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm4 + vmovdqu8 ZMMWORD[192+r11*1+r10]{k1},zmm5 + vmovdqu8 zmm5{k1}{z},zmm5 + vpshufb zmm6,zmm0,zmm29 + vpshufb zmm7,zmm3,zmm29 + vpshufb zmm10,zmm4,zmm29 + vpshufb zmm11,zmm5,zmm29 + vextracti32x4 xmm13,zmm11,2 + sub r13,16 * (15 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_170 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[112+rdx] + vpclmulqdq zmm0,zmm6,zmm20,0x11 + vpclmulqdq zmm3,zmm6,zmm20,0x00 + vpclmulqdq zmm4,zmm6,zmm20,0x01 + vpclmulqdq zmm5,zmm6,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[176+rdx] + vpclmulqdq zmm15,zmm7,zmm20,0x11 + vpclmulqdq zmm16,zmm7,zmm20,0x00 + vpclmulqdq zmm17,zmm7,zmm20,0x01 + vpclmulqdq zmm19,zmm7,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[240+rdx] + vpclmulqdq zmm6,zmm10,zmm20,0x11 + vpclmulqdq zmm7,zmm10,zmm20,0x00 + vpternlogq zmm15,zmm6,zmm0,0x96 + vpternlogq zmm16,zmm7,zmm3,0x96 + vpclmulqdq zmm6,zmm10,zmm20,0x01 + vpclmulqdq zmm7,zmm10,zmm20,0x10 + vpternlogq zmm17,zmm6,zmm4,0x96 + vpternlogq zmm19,zmm7,zmm5,0x96 + vmovdqu64 ymm20,YMMWORD[304+rdx] + vinserti64x2 zmm20,zmm20,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm11,zmm20,0x01 + vpclmulqdq zmm5,zmm11,zmm20,0x10 + vpclmulqdq zmm0,zmm11,zmm20,0x11 + vpclmulqdq zmm3,zmm11,zmm20,0x00 + + vpxorq zmm4,zmm4,zmm17 + vpxorq zmm5,zmm5,zmm19 + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm16 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_170 +$L$_small_initial_partial_block_170: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm12 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[128+rdx] + vpclmulqdq zmm0,zmm6,zmm20,0x11 + vpclmulqdq zmm3,zmm6,zmm20,0x00 + vpclmulqdq zmm4,zmm6,zmm20,0x01 + vpclmulqdq 
zmm5,zmm6,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[192+rdx] + vpclmulqdq zmm15,zmm7,zmm20,0x11 + vpclmulqdq zmm16,zmm7,zmm20,0x00 + vpclmulqdq zmm17,zmm7,zmm20,0x01 + vpclmulqdq zmm19,zmm7,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[256+rdx] + vpclmulqdq zmm6,zmm10,zmm20,0x11 + vpclmulqdq zmm7,zmm10,zmm20,0x00 + vpternlogq zmm15,zmm6,zmm0,0x96 + vpternlogq zmm16,zmm7,zmm3,0x96 + vpclmulqdq zmm6,zmm10,zmm20,0x01 + vpclmulqdq zmm7,zmm10,zmm20,0x10 + vpternlogq zmm17,zmm6,zmm4,0x96 + vpternlogq zmm19,zmm7,zmm5,0x96 + vmovdqu64 ymm20,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm11,ymm20,0x01 + vpclmulqdq ymm5,ymm11,ymm20,0x10 + vpclmulqdq ymm0,ymm11,ymm20,0x11 + vpclmulqdq ymm3,ymm11,ymm20,0x00 + + vpxorq zmm4,zmm4,zmm17 + vpxorq zmm5,zmm5,zmm19 + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm16 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_170: + + or r13,r13 + je NEAR $L$_after_reduction_170 + vpxorq xmm14,xmm14,xmm13 +$L$_after_reduction_170: + jmp NEAR $L$_small_initial_blocks_encrypted_155 +$L$_small_initial_num_blocks_is_16_155: + vmovdqa64 zmm29,ZMMWORD[SHUF_MASK] + vshufi64x2 zmm2,zmm2,zmm2,0 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpaddd zmm3,zmm2,ZMMWORD[ddq_add_5678] + vpaddd zmm4,zmm0,ZMMWORD[ddq_add_8888] + vpaddd zmm5,zmm3,ZMMWORD[ddq_add_8888] + lea r10,[byte64_len_to_mask_table] + mov r15,r13 + sub r15,192 + kmovq k1,[r15*8+r10] + vextracti32x4 xmm2,zmm5,3 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb zmm5,zmm5,zmm29 + vmovdqu8 zmm6,ZMMWORD[r11*1+r9] + vmovdqu8 zmm7,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm10,ZMMWORD[128+r11*1+r9] + vmovdqu8 zmm11{k1}{z},[192+r11*1+r9] + vbroadcastf64x2 zmm15,ZMMWORD[rcx] + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm15 + vpxorq zmm4,zmm4,zmm15 + vpxorq zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[16+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[32+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[48+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[64+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[80+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[96+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[112+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[128+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc zmm5,zmm5,zmm15 + vbroadcastf64x2 
zmm15,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[160+rcx] + vaesenclast zmm0,zmm0,zmm15 + vaesenclast zmm3,zmm3,zmm15 + vaesenclast zmm4,zmm4,zmm15 + vaesenclast zmm5,zmm5,zmm15 + vpxorq zmm0,zmm0,zmm6 + vpxorq zmm3,zmm3,zmm7 + vpxorq zmm4,zmm4,zmm10 + vpxorq zmm5,zmm5,zmm11 + vextracti32x4 xmm12,zmm5,3 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm4 + vmovdqu8 ZMMWORD[192+r11*1+r10]{k1},zmm5 + vmovdqu8 zmm5{k1}{z},zmm5 + vpshufb zmm6,zmm0,zmm29 + vpshufb zmm7,zmm3,zmm29 + vpshufb zmm10,zmm4,zmm29 + vpshufb zmm11,zmm5,zmm29 + vextracti32x4 xmm13,zmm11,3 + sub r13,16 * (16 - 1) +$L$_small_initial_partial_block_171: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm12 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[112+rdx] + vpclmulqdq zmm0,zmm6,zmm20,0x11 + vpclmulqdq zmm3,zmm6,zmm20,0x00 + vpclmulqdq zmm4,zmm6,zmm20,0x01 + vpclmulqdq zmm5,zmm6,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[176+rdx] + vpclmulqdq zmm15,zmm7,zmm20,0x11 + vpclmulqdq zmm16,zmm7,zmm20,0x00 + vpclmulqdq zmm17,zmm7,zmm20,0x01 + vpclmulqdq zmm19,zmm7,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[240+rdx] + vpclmulqdq zmm6,zmm10,zmm20,0x11 + vpclmulqdq zmm7,zmm10,zmm20,0x00 + vpternlogq zmm15,zmm6,zmm0,0x96 + vpternlogq zmm16,zmm7,zmm3,0x96 + vpclmulqdq zmm6,zmm10,zmm20,0x01 + vpclmulqdq zmm7,zmm10,zmm20,0x10 + vpternlogq zmm17,zmm6,zmm4,0x96 + vpternlogq zmm19,zmm7,zmm5,0x96 + vmovdqu64 ymm20,YMMWORD[304+rdx] + vinserti64x2 zmm20,zmm20,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm11,zmm20,0x01 + vpclmulqdq zmm5,zmm11,zmm20,0x10 + vpclmulqdq zmm0,zmm11,zmm20,0x11 + vpclmulqdq zmm3,zmm11,zmm20,0x00 + + vpxorq zmm4,zmm4,zmm17 + vpxorq zmm5,zmm5,zmm19 + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm16 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_171: + vpxorq xmm14,xmm14,xmm13 +$L$_after_reduction_171: +$L$_small_initial_blocks_encrypted_155: +$L$_ghash_done_10: + vmovdqu64 XMMWORD[rdx],xmm2 + vmovdqu64 XMMWORD[64+rdx],xmm14 +$L$_enc_dec_done_10: + jmp NEAR $L$exit_gcm_encrypt +ALIGN 32 +$L$aes_gcm_encrypt_192_avx512: + cmp QWORD[112+rbp],0 + je NEAR $L$_enc_dec_done_172 + xor r14,r14 + vmovdqu64 xmm14,XMMWORD[64+rdx] + + mov r11,QWORD[r8] + or r11,r11 + je NEAR $L$_partial_block_done_173 + mov r10d,16 + lea r12,[byte_len_to_mask_table] + cmp QWORD[112+rbp],r10 + cmovc r10,QWORD[112+rbp] + add r12,r10 + add r12,r10 + kmovw k1,[r12] + vmovdqu8 xmm0{k1}{z},[r9] + + vmovdqu64 xmm3,XMMWORD[16+rdx] + vmovdqu64 xmm4,XMMWORD[336+rdx] + + + + lea r12,[SHIFT_MASK] + add r12,r11 + vmovdqu64 xmm5,XMMWORD[r12] + vpshufb xmm3,xmm3,xmm5 + vpxorq xmm3,xmm3,xmm0 + + + mov r13,QWORD[112+rbp] + add r13,r11 + sub r13,16 + jge NEAR $L$_no_extra_mask_173 + sub r12,r13 +$L$_no_extra_mask_173: + + + + vmovdqu64 xmm0,XMMWORD[16+r12] + vpand xmm3,xmm3,xmm0 + vpshufb 
xmm3,xmm3,XMMWORD[SHUF_MASK] + vpshufb xmm3,xmm3,xmm5 + vpxorq xmm14,xmm14,xmm3 + cmp r13,0 + jl NEAR $L$_partial_incomplete_173 + + vpclmulqdq xmm7,xmm14,xmm4,0x11 + vpclmulqdq xmm10,xmm14,xmm4,0x00 + vpclmulqdq xmm11,xmm14,xmm4,0x01 + vpclmulqdq xmm14,xmm14,xmm4,0x10 + vpxorq xmm14,xmm14,xmm11 + + vpsrldq xmm11,xmm14,8 + vpslldq xmm14,xmm14,8 + vpxorq xmm7,xmm7,xmm11 + vpxorq xmm14,xmm14,xmm10 + + + + vmovdqu64 xmm11,XMMWORD[POLY2] + + vpclmulqdq xmm10,xmm11,xmm14,0x01 + vpslldq xmm10,xmm10,8 + vpxorq xmm14,xmm14,xmm10 + + + + vpclmulqdq xmm10,xmm11,xmm14,0x00 + vpsrldq xmm10,xmm10,4 + vpclmulqdq xmm14,xmm11,xmm14,0x10 + vpslldq xmm14,xmm14,4 + + vpternlogq xmm14,xmm7,xmm10,0x96 + + mov QWORD[r8],0 + + mov r12,r11 + mov r11,16 + sub r11,r12 + jmp NEAR $L$_enc_dec_done_173 + +$L$_partial_incomplete_173: + mov r12,QWORD[112+rbp] + add QWORD[r8],r12 + mov r11,QWORD[112+rbp] + +$L$_enc_dec_done_173: + + + lea r12,[byte_len_to_mask_table] + kmovw k1,[r11*2+r12] + vmovdqu64 XMMWORD[64+rdx],xmm14 + + vpshufb xmm3,xmm3,XMMWORD[SHUF_MASK] + vpshufb xmm3,xmm3,xmm5 + mov r12,QWORD[120+rbp] + vmovdqu8 XMMWORD[r12]{k1},xmm3 +$L$_partial_block_done_173: + vmovdqu64 xmm2,XMMWORD[rdx] + mov r13,QWORD[112+rbp] + sub r13,r11 + je NEAR $L$_enc_dec_done_172 + cmp r13,256 + jbe NEAR $L$_message_below_equal_16_blocks_172 + + vmovdqa64 zmm29,ZMMWORD[SHUF_MASK] + vmovdqa64 zmm27,ZMMWORD[ddq_addbe_4444] + vmovdqa64 zmm28,ZMMWORD[ddq_addbe_1234] + + + + + + + vmovd r15d,xmm2 + and r15d,255 + + vshufi64x2 zmm2,zmm2,zmm2,0 + vpshufb zmm2,zmm2,zmm29 + + + + cmp r15b,240 + jae NEAR $L$_next_16_overflow_174 + vpaddd zmm7,zmm2,zmm28 + vpaddd zmm10,zmm7,zmm27 + vpaddd zmm11,zmm10,zmm27 + vpaddd zmm12,zmm11,zmm27 + jmp NEAR $L$_next_16_ok_174 +$L$_next_16_overflow_174: + vpshufb zmm2,zmm2,zmm29 + vmovdqa64 zmm12,ZMMWORD[ddq_add_4444] + vpaddd zmm7,zmm2,ZMMWORD[ddq_add_1234] + vpaddd zmm10,zmm7,zmm12 + vpaddd zmm11,zmm10,zmm12 + vpaddd zmm12,zmm11,zmm12 + vpshufb zmm7,zmm7,zmm29 + vpshufb zmm10,zmm10,zmm29 + vpshufb zmm11,zmm11,zmm29 + vpshufb zmm12,zmm12,zmm29 +$L$_next_16_ok_174: + vshufi64x2 zmm2,zmm12,zmm12,255 + add r15b,16 + + vmovdqu8 zmm0,ZMMWORD[r11*1+r9] + vmovdqu8 zmm3,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm4,ZMMWORD[128+r11*1+r9] + vmovdqu8 zmm5,ZMMWORD[192+r11*1+r9] + + + vbroadcastf64x2 zmm6,ZMMWORD[rcx] + vpxorq zmm7,zmm7,zmm6 + vpxorq zmm10,zmm10,zmm6 + vpxorq zmm11,zmm11,zmm6 + vpxorq zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[16+rcx] + vaesenc zmm7,zmm7,zmm6 + vaesenc zmm10,zmm10,zmm6 + vaesenc zmm11,zmm11,zmm6 + vaesenc zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[32+rcx] + vaesenc zmm7,zmm7,zmm6 + vaesenc zmm10,zmm10,zmm6 + vaesenc zmm11,zmm11,zmm6 + vaesenc zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[48+rcx] + vaesenc zmm7,zmm7,zmm6 + vaesenc zmm10,zmm10,zmm6 + vaesenc zmm11,zmm11,zmm6 + vaesenc zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[64+rcx] + vaesenc zmm7,zmm7,zmm6 + vaesenc zmm10,zmm10,zmm6 + vaesenc zmm11,zmm11,zmm6 + vaesenc zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[80+rcx] + vaesenc zmm7,zmm7,zmm6 + vaesenc zmm10,zmm10,zmm6 + vaesenc zmm11,zmm11,zmm6 + vaesenc zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[96+rcx] + vaesenc zmm7,zmm7,zmm6 + vaesenc zmm10,zmm10,zmm6 + vaesenc zmm11,zmm11,zmm6 + vaesenc zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[112+rcx] + vaesenc zmm7,zmm7,zmm6 + vaesenc zmm10,zmm10,zmm6 + vaesenc zmm11,zmm11,zmm6 + vaesenc zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[128+rcx] + vaesenc zmm7,zmm7,zmm6 + vaesenc zmm10,zmm10,zmm6 + vaesenc 
zmm11,zmm11,zmm6 + vaesenc zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[144+rcx] + vaesenc zmm7,zmm7,zmm6 + vaesenc zmm10,zmm10,zmm6 + vaesenc zmm11,zmm11,zmm6 + vaesenc zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[160+rcx] + vaesenc zmm7,zmm7,zmm6 + vaesenc zmm10,zmm10,zmm6 + vaesenc zmm11,zmm11,zmm6 + vaesenc zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[176+rcx] + vaesenc zmm7,zmm7,zmm6 + vaesenc zmm10,zmm10,zmm6 + vaesenc zmm11,zmm11,zmm6 + vaesenc zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[192+rcx] + vaesenclast zmm7,zmm7,zmm6 + vaesenclast zmm10,zmm10,zmm6 + vaesenclast zmm11,zmm11,zmm6 + vaesenclast zmm12,zmm12,zmm6 + + + vpxorq zmm7,zmm7,zmm0 + vpxorq zmm10,zmm10,zmm3 + vpxorq zmm11,zmm11,zmm4 + vpxorq zmm12,zmm12,zmm5 + + + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm7 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm10 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm11 + vmovdqu8 ZMMWORD[192+r11*1+r10],zmm12 + + vpshufb zmm7,zmm7,zmm29 + vpshufb zmm10,zmm10,zmm29 + vpshufb zmm11,zmm11,zmm29 + vpshufb zmm12,zmm12,zmm29 + vmovdqa64 ZMMWORD[768+rsp],zmm7 + vmovdqa64 ZMMWORD[832+rsp],zmm10 + vmovdqa64 ZMMWORD[896+rsp],zmm11 + vmovdqa64 ZMMWORD[960+rsp],zmm12 + test r14,r14 + jnz NEAR $L$_skip_hkeys_precomputation_175 + + vmovdqu64 zmm0,ZMMWORD[288+rdx] + vmovdqu64 ZMMWORD[704+rsp],zmm0 + + vmovdqu64 zmm3,ZMMWORD[224+rdx] + vmovdqu64 ZMMWORD[640+rsp],zmm3 + + + vshufi64x2 zmm3,zmm3,zmm3,0x00 + + vmovdqu64 zmm4,ZMMWORD[160+rdx] + vmovdqu64 ZMMWORD[576+rsp],zmm4 + + vmovdqu64 zmm5,ZMMWORD[96+rdx] + vmovdqu64 ZMMWORD[512+rsp],zmm5 +$L$_skip_hkeys_precomputation_175: + cmp r13,512 + jb NEAR $L$_message_below_32_blocks_172 + + + + cmp r15b,240 + jae NEAR $L$_next_16_overflow_176 + vpaddd zmm7,zmm2,zmm28 + vpaddd zmm10,zmm7,zmm27 + vpaddd zmm11,zmm10,zmm27 + vpaddd zmm12,zmm11,zmm27 + jmp NEAR $L$_next_16_ok_176 +$L$_next_16_overflow_176: + vpshufb zmm2,zmm2,zmm29 + vmovdqa64 zmm12,ZMMWORD[ddq_add_4444] + vpaddd zmm7,zmm2,ZMMWORD[ddq_add_1234] + vpaddd zmm10,zmm7,zmm12 + vpaddd zmm11,zmm10,zmm12 + vpaddd zmm12,zmm11,zmm12 + vpshufb zmm7,zmm7,zmm29 + vpshufb zmm10,zmm10,zmm29 + vpshufb zmm11,zmm11,zmm29 + vpshufb zmm12,zmm12,zmm29 +$L$_next_16_ok_176: + vshufi64x2 zmm2,zmm12,zmm12,255 + add r15b,16 + + vmovdqu8 zmm0,ZMMWORD[256+r11*1+r9] + vmovdqu8 zmm3,ZMMWORD[320+r11*1+r9] + vmovdqu8 zmm4,ZMMWORD[384+r11*1+r9] + vmovdqu8 zmm5,ZMMWORD[448+r11*1+r9] + + + vbroadcastf64x2 zmm6,ZMMWORD[rcx] + vpxorq zmm7,zmm7,zmm6 + vpxorq zmm10,zmm10,zmm6 + vpxorq zmm11,zmm11,zmm6 + vpxorq zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[16+rcx] + vaesenc zmm7,zmm7,zmm6 + vaesenc zmm10,zmm10,zmm6 + vaesenc zmm11,zmm11,zmm6 + vaesenc zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[32+rcx] + vaesenc zmm7,zmm7,zmm6 + vaesenc zmm10,zmm10,zmm6 + vaesenc zmm11,zmm11,zmm6 + vaesenc zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[48+rcx] + vaesenc zmm7,zmm7,zmm6 + vaesenc zmm10,zmm10,zmm6 + vaesenc zmm11,zmm11,zmm6 + vaesenc zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[64+rcx] + vaesenc zmm7,zmm7,zmm6 + vaesenc zmm10,zmm10,zmm6 + vaesenc zmm11,zmm11,zmm6 + vaesenc zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[80+rcx] + vaesenc zmm7,zmm7,zmm6 + vaesenc zmm10,zmm10,zmm6 + vaesenc zmm11,zmm11,zmm6 + vaesenc zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[96+rcx] + vaesenc zmm7,zmm7,zmm6 + vaesenc zmm10,zmm10,zmm6 + vaesenc zmm11,zmm11,zmm6 + vaesenc zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[112+rcx] + vaesenc zmm7,zmm7,zmm6 + vaesenc zmm10,zmm10,zmm6 + vaesenc zmm11,zmm11,zmm6 + vaesenc zmm12,zmm12,zmm6 + 
vbroadcastf64x2 zmm6,ZMMWORD[128+rcx] + vaesenc zmm7,zmm7,zmm6 + vaesenc zmm10,zmm10,zmm6 + vaesenc zmm11,zmm11,zmm6 + vaesenc zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[144+rcx] + vaesenc zmm7,zmm7,zmm6 + vaesenc zmm10,zmm10,zmm6 + vaesenc zmm11,zmm11,zmm6 + vaesenc zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[160+rcx] + vaesenc zmm7,zmm7,zmm6 + vaesenc zmm10,zmm10,zmm6 + vaesenc zmm11,zmm11,zmm6 + vaesenc zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[176+rcx] + vaesenc zmm7,zmm7,zmm6 + vaesenc zmm10,zmm10,zmm6 + vaesenc zmm11,zmm11,zmm6 + vaesenc zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[192+rcx] + vaesenclast zmm7,zmm7,zmm6 + vaesenclast zmm10,zmm10,zmm6 + vaesenclast zmm11,zmm11,zmm6 + vaesenclast zmm12,zmm12,zmm6 + + + vpxorq zmm7,zmm7,zmm0 + vpxorq zmm10,zmm10,zmm3 + vpxorq zmm11,zmm11,zmm4 + vpxorq zmm12,zmm12,zmm5 + + + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[256+r11*1+r10],zmm7 + vmovdqu8 ZMMWORD[320+r11*1+r10],zmm10 + vmovdqu8 ZMMWORD[384+r11*1+r10],zmm11 + vmovdqu8 ZMMWORD[448+r11*1+r10],zmm12 + + vpshufb zmm7,zmm7,zmm29 + vpshufb zmm10,zmm10,zmm29 + vpshufb zmm11,zmm11,zmm29 + vpshufb zmm12,zmm12,zmm29 + vmovdqa64 ZMMWORD[1024+rsp],zmm7 + vmovdqa64 ZMMWORD[1088+rsp],zmm10 + vmovdqa64 ZMMWORD[1152+rsp],zmm11 + vmovdqa64 ZMMWORD[1216+rsp],zmm12 + test r14,r14 + jnz NEAR $L$_skip_hkeys_precomputation_177 + vmovdqu64 zmm3,ZMMWORD[640+rsp] + + + vshufi64x2 zmm3,zmm3,zmm3,0x00 + + vmovdqu64 zmm4,ZMMWORD[576+rsp] + vmovdqu64 zmm5,ZMMWORD[512+rsp] + + vpclmulqdq zmm6,zmm4,zmm3,0x11 + vpclmulqdq zmm7,zmm4,zmm3,0x00 + vpclmulqdq zmm10,zmm4,zmm3,0x01 + vpclmulqdq zmm4,zmm4,zmm3,0x10 + vpxorq zmm4,zmm4,zmm10 + + vpsrldq zmm10,zmm4,8 + vpslldq zmm4,zmm4,8 + vpxorq zmm6,zmm6,zmm10 + vpxorq zmm4,zmm4,zmm7 + + + + vmovdqu64 zmm10,ZMMWORD[POLY2] + + vpclmulqdq zmm7,zmm10,zmm4,0x01 + vpslldq zmm7,zmm7,8 + vpxorq zmm4,zmm4,zmm7 + + + + vpclmulqdq zmm7,zmm10,zmm4,0x00 + vpsrldq zmm7,zmm7,4 + vpclmulqdq zmm4,zmm10,zmm4,0x10 + vpslldq zmm4,zmm4,4 + + vpternlogq zmm4,zmm6,zmm7,0x96 + + vmovdqu64 ZMMWORD[448+rsp],zmm4 + + vpclmulqdq zmm6,zmm5,zmm3,0x11 + vpclmulqdq zmm7,zmm5,zmm3,0x00 + vpclmulqdq zmm10,zmm5,zmm3,0x01 + vpclmulqdq zmm5,zmm5,zmm3,0x10 + vpxorq zmm5,zmm5,zmm10 + + vpsrldq zmm10,zmm5,8 + vpslldq zmm5,zmm5,8 + vpxorq zmm6,zmm6,zmm10 + vpxorq zmm5,zmm5,zmm7 + + + + vmovdqu64 zmm10,ZMMWORD[POLY2] + + vpclmulqdq zmm7,zmm10,zmm5,0x01 + vpslldq zmm7,zmm7,8 + vpxorq zmm5,zmm5,zmm7 + + + + vpclmulqdq zmm7,zmm10,zmm5,0x00 + vpsrldq zmm7,zmm7,4 + vpclmulqdq zmm5,zmm10,zmm5,0x10 + vpslldq zmm5,zmm5,4 + + vpternlogq zmm5,zmm6,zmm7,0x96 + + vmovdqu64 ZMMWORD[384+rsp],zmm5 + + vpclmulqdq zmm6,zmm4,zmm3,0x11 + vpclmulqdq zmm7,zmm4,zmm3,0x00 + vpclmulqdq zmm10,zmm4,zmm3,0x01 + vpclmulqdq zmm4,zmm4,zmm3,0x10 + vpxorq zmm4,zmm4,zmm10 + + vpsrldq zmm10,zmm4,8 + vpslldq zmm4,zmm4,8 + vpxorq zmm6,zmm6,zmm10 + vpxorq zmm4,zmm4,zmm7 + + + + vmovdqu64 zmm10,ZMMWORD[POLY2] + + vpclmulqdq zmm7,zmm10,zmm4,0x01 + vpslldq zmm7,zmm7,8 + vpxorq zmm4,zmm4,zmm7 + + + + vpclmulqdq zmm7,zmm10,zmm4,0x00 + vpsrldq zmm7,zmm7,4 + vpclmulqdq zmm4,zmm10,zmm4,0x10 + vpslldq zmm4,zmm4,4 + + vpternlogq zmm4,zmm6,zmm7,0x96 + + vmovdqu64 ZMMWORD[320+rsp],zmm4 + + vpclmulqdq zmm6,zmm5,zmm3,0x11 + vpclmulqdq zmm7,zmm5,zmm3,0x00 + vpclmulqdq zmm10,zmm5,zmm3,0x01 + vpclmulqdq zmm5,zmm5,zmm3,0x10 + vpxorq zmm5,zmm5,zmm10 + + vpsrldq zmm10,zmm5,8 + vpslldq zmm5,zmm5,8 + vpxorq zmm6,zmm6,zmm10 + vpxorq zmm5,zmm5,zmm7 + + + + vmovdqu64 zmm10,ZMMWORD[POLY2] + + vpclmulqdq zmm7,zmm10,zmm5,0x01 + vpslldq zmm7,zmm7,8 + vpxorq 
zmm5,zmm5,zmm7 + + + + vpclmulqdq zmm7,zmm10,zmm5,0x00 + vpsrldq zmm7,zmm7,4 + vpclmulqdq zmm5,zmm10,zmm5,0x10 + vpslldq zmm5,zmm5,4 + + vpternlogq zmm5,zmm6,zmm7,0x96 + + vmovdqu64 ZMMWORD[256+rsp],zmm5 + + vpclmulqdq zmm6,zmm4,zmm3,0x11 + vpclmulqdq zmm7,zmm4,zmm3,0x00 + vpclmulqdq zmm10,zmm4,zmm3,0x01 + vpclmulqdq zmm4,zmm4,zmm3,0x10 + vpxorq zmm4,zmm4,zmm10 + + vpsrldq zmm10,zmm4,8 + vpslldq zmm4,zmm4,8 + vpxorq zmm6,zmm6,zmm10 + vpxorq zmm4,zmm4,zmm7 + + + + vmovdqu64 zmm10,ZMMWORD[POLY2] + + vpclmulqdq zmm7,zmm10,zmm4,0x01 + vpslldq zmm7,zmm7,8 + vpxorq zmm4,zmm4,zmm7 + + + + vpclmulqdq zmm7,zmm10,zmm4,0x00 + vpsrldq zmm7,zmm7,4 + vpclmulqdq zmm4,zmm10,zmm4,0x10 + vpslldq zmm4,zmm4,4 + + vpternlogq zmm4,zmm6,zmm7,0x96 + + vmovdqu64 ZMMWORD[192+rsp],zmm4 + + vpclmulqdq zmm6,zmm5,zmm3,0x11 + vpclmulqdq zmm7,zmm5,zmm3,0x00 + vpclmulqdq zmm10,zmm5,zmm3,0x01 + vpclmulqdq zmm5,zmm5,zmm3,0x10 + vpxorq zmm5,zmm5,zmm10 + + vpsrldq zmm10,zmm5,8 + vpslldq zmm5,zmm5,8 + vpxorq zmm6,zmm6,zmm10 + vpxorq zmm5,zmm5,zmm7 + + + + vmovdqu64 zmm10,ZMMWORD[POLY2] + + vpclmulqdq zmm7,zmm10,zmm5,0x01 + vpslldq zmm7,zmm7,8 + vpxorq zmm5,zmm5,zmm7 + + + + vpclmulqdq zmm7,zmm10,zmm5,0x00 + vpsrldq zmm7,zmm7,4 + vpclmulqdq zmm5,zmm10,zmm5,0x10 + vpslldq zmm5,zmm5,4 + + vpternlogq zmm5,zmm6,zmm7,0x96 + + vmovdqu64 ZMMWORD[128+rsp],zmm5 + + vpclmulqdq zmm6,zmm4,zmm3,0x11 + vpclmulqdq zmm7,zmm4,zmm3,0x00 + vpclmulqdq zmm10,zmm4,zmm3,0x01 + vpclmulqdq zmm4,zmm4,zmm3,0x10 + vpxorq zmm4,zmm4,zmm10 + + vpsrldq zmm10,zmm4,8 + vpslldq zmm4,zmm4,8 + vpxorq zmm6,zmm6,zmm10 + vpxorq zmm4,zmm4,zmm7 + + + + vmovdqu64 zmm10,ZMMWORD[POLY2] + + vpclmulqdq zmm7,zmm10,zmm4,0x01 + vpslldq zmm7,zmm7,8 + vpxorq zmm4,zmm4,zmm7 + + + + vpclmulqdq zmm7,zmm10,zmm4,0x00 + vpsrldq zmm7,zmm7,4 + vpclmulqdq zmm4,zmm10,zmm4,0x10 + vpslldq zmm4,zmm4,4 + + vpternlogq zmm4,zmm6,zmm7,0x96 + + vmovdqu64 ZMMWORD[64+rsp],zmm4 + + vpclmulqdq zmm6,zmm5,zmm3,0x11 + vpclmulqdq zmm7,zmm5,zmm3,0x00 + vpclmulqdq zmm10,zmm5,zmm3,0x01 + vpclmulqdq zmm5,zmm5,zmm3,0x10 + vpxorq zmm5,zmm5,zmm10 + + vpsrldq zmm10,zmm5,8 + vpslldq zmm5,zmm5,8 + vpxorq zmm6,zmm6,zmm10 + vpxorq zmm5,zmm5,zmm7 + + + + vmovdqu64 zmm10,ZMMWORD[POLY2] + + vpclmulqdq zmm7,zmm10,zmm5,0x01 + vpslldq zmm7,zmm7,8 + vpxorq zmm5,zmm5,zmm7 + + + + vpclmulqdq zmm7,zmm10,zmm5,0x00 + vpsrldq zmm7,zmm7,4 + vpclmulqdq zmm5,zmm10,zmm5,0x10 + vpslldq zmm5,zmm5,4 + + vpternlogq zmm5,zmm6,zmm7,0x96 + + vmovdqu64 ZMMWORD[rsp],zmm5 +$L$_skip_hkeys_precomputation_177: + mov r14,1 + add r11,512 + sub r13,512 + + cmp r13,768 + jb NEAR $L$_no_more_big_nblocks_172 +$L$_encrypt_big_nblocks_172: + cmp r15b,240 + jae NEAR $L$_16_blocks_overflow_178 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + vpaddd zmm5,zmm4,zmm27 + jmp NEAR $L$_16_blocks_ok_178 +$L$_16_blocks_overflow_178: + vpshufb zmm2,zmm2,zmm29 + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpaddd zmm5,zmm4,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb zmm5,zmm5,zmm29 +$L$_16_blocks_ok_178: + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rsp] + + + + + vshufi64x2 zmm2,zmm5,zmm5,255 + add r15b,16 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + + + + + + + + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm30 + vbroadcastf64x2 
zmm30,ZMMWORD[32+rcx] + + + + vpclmulqdq zmm6,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + + + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + + + + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + + vpternlogq zmm6,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + + + + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + + + + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20,ZMMWORD[128+r11*1+r9] + vmovdqu8 zmm21,ZMMWORD[192+r11*1+r9] + + + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + + + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm26,zmm10,zmm15 + vpxorq zmm24,zmm6,zmm12 + vpxorq zmm25,zmm7,zmm13 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vaesenclast zmm5,zmm5,zmm30 + + + + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vpxorq zmm5,zmm5,zmm21 + + + + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm4 + vmovdqu8 ZMMWORD[192+r11*1+r10],zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb zmm5,zmm5,zmm29 + vmovdqa64 ZMMWORD[1280+rsp],zmm0 + vmovdqa64 ZMMWORD[1344+rsp],zmm3 + vmovdqa64 ZMMWORD[1408+rsp],zmm4 + vmovdqa64 ZMMWORD[1472+rsp],zmm5 + cmp r15b,240 + jae NEAR $L$_16_blocks_overflow_179 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + vpaddd zmm5,zmm4,zmm27 + 
jmp NEAR $L$_16_blocks_ok_179 +$L$_16_blocks_overflow_179: + vpshufb zmm2,zmm2,zmm29 + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpaddd zmm5,zmm4,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb zmm5,zmm5,zmm29 +$L$_16_blocks_ok_179: + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1024+rsp] + vmovdqu64 zmm1,ZMMWORD[256+rsp] + + + + + vshufi64x2 zmm2,zmm5,zmm5,255 + add r15b,16 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[320+rsp] + vmovdqa64 zmm22,ZMMWORD[1088+rsp] + + + + + + + + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + + vpclmulqdq zmm6,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[384+rsp] + vmovdqa64 zmm8,ZMMWORD[1152+rsp] + + + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[448+rsp] + vmovdqa64 zmm22,ZMMWORD[1216+rsp] + + + + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + + vpternlogq zmm6,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + + + + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + + + + vmovdqu8 zmm17,ZMMWORD[256+r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[320+r11*1+r9] + vmovdqu8 zmm20,ZMMWORD[384+r11*1+r9] + vmovdqu8 zmm21,ZMMWORD[448+r11*1+r9] + + + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + + + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm26,zmm10,zmm15,0x96 + vpternlogq zmm24,zmm6,zmm12,0x96 + vpternlogq zmm25,zmm7,zmm13,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + 
vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vaesenclast zmm5,zmm5,zmm30 + + + + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vpxorq zmm5,zmm5,zmm21 + + + + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[256+r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[320+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[384+r11*1+r10],zmm4 + vmovdqu8 ZMMWORD[448+r11*1+r10],zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb zmm5,zmm5,zmm29 + vmovdqa64 ZMMWORD[768+rsp],zmm0 + vmovdqa64 ZMMWORD[832+rsp],zmm3 + vmovdqa64 ZMMWORD[896+rsp],zmm4 + vmovdqa64 ZMMWORD[960+rsp],zmm5 + cmp r15b,240 + jae NEAR $L$_16_blocks_overflow_180 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + vpaddd zmm5,zmm4,zmm27 + jmp NEAR $L$_16_blocks_ok_180 +$L$_16_blocks_overflow_180: + vpshufb zmm2,zmm2,zmm29 + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpaddd zmm5,zmm4,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb zmm5,zmm5,zmm29 +$L$_16_blocks_ok_180: + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1280+rsp] + vmovdqu64 zmm1,ZMMWORD[512+rsp] + + + + + vshufi64x2 zmm2,zmm5,zmm5,255 + add r15b,16 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[576+rsp] + vmovdqa64 zmm22,ZMMWORD[1344+rsp] + + + + + + + + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + + vpclmulqdq zmm6,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[640+rsp] + vmovdqa64 zmm8,ZMMWORD[1408+rsp] + + + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[704+rsp] + vmovdqa64 zmm22,ZMMWORD[1472+rsp] + + + + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + + vpternlogq zmm6,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + + + + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + + + + vmovdqu8 zmm17,ZMMWORD[512+r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[576+r11*1+r9] + vmovdqu8 zmm20,ZMMWORD[640+r11*1+r9] + vmovdqu8 zmm21,ZMMWORD[704+r11*1+r9] + + + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + + + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + + + vpternlogq 
zmm10,zmm11,zmm16,0x96 + vpternlogq zmm10,zmm26,zmm15,0x96 + + vpsrldq zmm15,zmm10,8 + vpslldq zmm10,zmm10,8 + + vmovdqa64 xmm16,XMMWORD[POLY2] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vpternlogq zmm6,zmm12,zmm15,0x96 + vpxorq zmm6,zmm6,zmm24 + vpternlogq zmm7,zmm13,zmm10,0x96 + vpxorq zmm7,zmm7,zmm25 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vextracti64x4 ymm12,zmm6,1 + vpxorq ymm6,ymm6,ymm12 + vextracti32x4 xmm12,ymm6,1 + vpxorq xmm6,xmm6,xmm12 + vextracti64x4 ymm13,zmm7,1 + vpxorq ymm7,ymm7,ymm13 + vextracti32x4 xmm13,ymm7,1 + vpxorq xmm7,xmm7,xmm13 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vpclmulqdq xmm13,xmm16,xmm7,0x01 + vpslldq xmm13,xmm13,8 + vpxorq xmm13,xmm7,xmm13 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vpclmulqdq xmm12,xmm16,xmm13,0x00 + vpsrldq xmm12,xmm12,4 + vpclmulqdq xmm15,xmm16,xmm13,0x10 + vpslldq xmm15,xmm15,4 + + vpternlogq xmm6,xmm15,xmm12,0x96 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vaesenclast zmm5,zmm5,zmm30 + + + + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vpxorq zmm5,zmm5,zmm21 + + + + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[512+r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[576+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[640+r11*1+r10],zmm4 + vmovdqu8 ZMMWORD[704+r11*1+r10],zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb zmm5,zmm5,zmm29 + vmovdqa64 ZMMWORD[1024+rsp],zmm0 + vmovdqa64 ZMMWORD[1088+rsp],zmm3 + vmovdqa64 ZMMWORD[1152+rsp],zmm4 + vmovdqa64 ZMMWORD[1216+rsp],zmm5 + vmovdqa64 zmm14,zmm6 + + add r11,768 + sub r13,768 + cmp r13,768 + jae NEAR $L$_encrypt_big_nblocks_172 + +$L$_no_more_big_nblocks_172: + + cmp r13,512 + jae NEAR $L$_encrypt_32_blocks_172 + + cmp r13,256 + jae NEAR $L$_encrypt_16_blocks_172 +$L$_encrypt_0_blocks_ghash_32_172: + mov r10d,r13d + and r10d,~15 + mov ebx,256 + sub ebx,r10d + vmovdqa64 zmm13,ZMMWORD[768+rsp] + vpxorq zmm13,zmm13,zmm14 + vmovdqu64 zmm12,ZMMWORD[rbx*1+rsp] + vpclmulqdq zmm0,zmm13,zmm12,0x11 + vpclmulqdq zmm3,zmm13,zmm12,0x00 + vpclmulqdq zmm4,zmm13,zmm12,0x01 + vpclmulqdq zmm5,zmm13,zmm12,0x10 + vmovdqa64 zmm13,ZMMWORD[832+rsp] + vmovdqu64 zmm12,ZMMWORD[64+rbx*1+rsp] + vpclmulqdq zmm6,zmm13,zmm12,0x11 + vpclmulqdq zmm7,zmm13,zmm12,0x00 + vpclmulqdq zmm10,zmm13,zmm12,0x01 + vpclmulqdq zmm11,zmm13,zmm12,0x10 + vpxorq zmm26,zmm4,zmm10 + vpxorq zmm24,zmm0,zmm6 + vpxorq zmm25,zmm3,zmm7 + vpternlogq zmm26,zmm5,zmm11,0x96 + vmovdqa64 zmm13,ZMMWORD[896+rsp] + vmovdqu64 zmm12,ZMMWORD[128+rbx*1+rsp] + vpclmulqdq zmm0,zmm13,zmm12,0x11 + vpclmulqdq zmm3,zmm13,zmm12,0x00 + vpclmulqdq zmm4,zmm13,zmm12,0x01 + vpclmulqdq zmm5,zmm13,zmm12,0x10 + vmovdqa64 zmm13,ZMMWORD[960+rsp] + vmovdqu64 zmm12,ZMMWORD[192+rbx*1+rsp] + vpclmulqdq zmm6,zmm13,zmm12,0x11 + vpclmulqdq zmm7,zmm13,zmm12,0x00 + vpclmulqdq zmm10,zmm13,zmm12,0x01 + vpclmulqdq zmm11,zmm13,zmm12,0x10 + + vpternlogq zmm26,zmm4,zmm10,0x96 + vpternlogq zmm24,zmm0,zmm6,0x96 + vpternlogq zmm25,zmm3,zmm7,0x96 + vpternlogq zmm26,zmm5,zmm11,0x96 + add 
ebx,256 + mov r10d,r13d + add r10d,15 + shr r10d,4 + je NEAR $L$_last_num_blocks_is_0_181 + + cmp r10d,8 + je NEAR $L$_last_num_blocks_is_8_181 + jb NEAR $L$_last_num_blocks_is_7_1_181 + + + cmp r10d,12 + je NEAR $L$_last_num_blocks_is_12_181 + jb NEAR $L$_last_num_blocks_is_11_9_181 + + + cmp r10d,15 + je NEAR $L$_last_num_blocks_is_15_181 + ja NEAR $L$_last_num_blocks_is_16_181 + cmp r10d,14 + je NEAR $L$_last_num_blocks_is_14_181 + jmp NEAR $L$_last_num_blocks_is_13_181 + +$L$_last_num_blocks_is_11_9_181: + + cmp r10d,10 + je NEAR $L$_last_num_blocks_is_10_181 + ja NEAR $L$_last_num_blocks_is_11_181 + jmp NEAR $L$_last_num_blocks_is_9_181 + +$L$_last_num_blocks_is_7_1_181: + cmp r10d,4 + je NEAR $L$_last_num_blocks_is_4_181 + jb NEAR $L$_last_num_blocks_is_3_1_181 + + cmp r10d,6 + ja NEAR $L$_last_num_blocks_is_7_181 + je NEAR $L$_last_num_blocks_is_6_181 + jmp NEAR $L$_last_num_blocks_is_5_181 + +$L$_last_num_blocks_is_3_1_181: + + cmp r10d,2 + ja NEAR $L$_last_num_blocks_is_3_181 + je NEAR $L$_last_num_blocks_is_2_181 +$L$_last_num_blocks_is_1_181: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + kmovq k1,[rax*8+r10] + cmp r15d,255 + jae NEAR $L$_16_blocks_overflow_182 + vpaddd xmm0,xmm2,xmm28 + jmp NEAR $L$_16_blocks_ok_182 + +$L$_16_blocks_overflow_182: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpshufb xmm0,xmm0,xmm29 +$L$_16_blocks_ok_182: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1024+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm0,0 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1088+rsp] + vpxorq xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[1152+rsp] + vaesenc xmm0,xmm0,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1216+rsp] + vaesenc xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc xmm0,xmm0,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 xmm17{k1}{z},[r11*1+r9] + vaesenc xmm0,xmm0,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm24,zmm14,zmm12,0x96 + vpternlogq zmm25,zmm7,zmm13,0x96 + vpternlogq zmm26,zmm10,zmm15,0x96 + vaesenc xmm0,xmm0,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc xmm0,xmm0,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc xmm0,xmm0,xmm31 + vaesenclast xmm0,xmm0,xmm30 + vpxorq 
xmm0,xmm0,xmm17 + vextracti32x4 xmm11,zmm0,0 + mov r10,QWORD[120+rbp] + vmovdqu8 XMMWORD[r11*1+r10]{k1},xmm0 + vmovdqu8 zmm0{k1}{z},zmm0 + vpshufb xmm17,xmm0,xmm29 + vextracti32x4 xmm7,zmm17,0 + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_183 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm17,xmm1,0x01 + vpclmulqdq xmm5,xmm17,xmm1,0x10 + vpclmulqdq xmm0,xmm17,xmm1,0x11 + vpclmulqdq xmm3,xmm17,xmm1,0x00 + vpxorq zmm4,zmm4,zmm26 + vpxorq zmm0,zmm0,zmm24 + vpxorq zmm3,zmm3,zmm25 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_183 +$L$_small_initial_partial_block_183: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + + + vpsrldq zmm0,zmm26,8 + vpslldq zmm3,zmm26,8 + vpxorq zmm24,zmm24,zmm0 + vpxorq zmm25,zmm25,zmm3 + vextracti64x4 ymm0,zmm24,1 + vpxorq ymm24,ymm24,ymm0 + vextracti32x4 xmm0,ymm24,1 + vpxorq xmm24,xmm24,xmm0 + vextracti64x4 ymm3,zmm25,1 + vpxorq ymm25,ymm25,ymm3 + vextracti32x4 xmm3,ymm25,1 + vpxorq xmm25,xmm25,xmm3 + vmovdqa64 xmm0,XMMWORD[POLY2] + + + vpclmulqdq xmm3,xmm0,xmm25,0x01 + vpslldq xmm3,xmm3,8 + vpxorq xmm3,xmm25,xmm3 + + + vpclmulqdq xmm4,xmm0,xmm3,0x00 + vpsrldq xmm4,xmm4,4 + vpclmulqdq xmm14,xmm0,xmm3,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm4,xmm24,0x96 + + + + + + + + + + + + + vpxorq xmm14,xmm14,xmm7 + + jmp NEAR $L$_after_reduction_183 +$L$_small_initial_compute_done_183: +$L$_after_reduction_183: + jmp NEAR $L$_last_blocks_done_181 +$L$_last_num_blocks_is_2_181: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + kmovq k1,[rax*8+r10] + cmp r15d,254 + jae NEAR $L$_16_blocks_overflow_184 + vpaddd ymm0,ymm2,ymm28 + jmp NEAR $L$_16_blocks_ok_184 + +$L$_16_blocks_overflow_184: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpshufb ymm0,ymm0,ymm29 +$L$_16_blocks_ok_184: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1024+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm0,1 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1088+rsp] + vpxorq ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[1152+rsp] + vaesenc ymm0,ymm0,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1216+rsp] + vaesenc ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc ymm0,ymm0,ymm31 + vbroadcastf64x2 
zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 ymm17{k1}{z},[r11*1+r9] + vaesenc ymm0,ymm0,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm24,zmm14,zmm12,0x96 + vpternlogq zmm25,zmm7,zmm13,0x96 + vpternlogq zmm26,zmm10,zmm15,0x96 + vaesenc ymm0,ymm0,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc ymm0,ymm0,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc ymm0,ymm0,ymm31 + vaesenclast ymm0,ymm0,ymm30 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm11,zmm0,1 + mov r10,QWORD[120+rbp] + vmovdqu8 YMMWORD[r11*1+r10]{k1},ymm0 + vmovdqu8 zmm0{k1}{z},zmm0 + vpshufb ymm17,ymm0,ymm29 + vextracti32x4 xmm7,zmm17,1 + sub r13,16 * (2 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_185 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm17,ymm1,0x01 + vpclmulqdq ymm5,ymm17,ymm1,0x10 + vpclmulqdq ymm0,ymm17,ymm1,0x11 + vpclmulqdq ymm3,ymm17,ymm1,0x00 + vpxorq zmm4,zmm4,zmm26 + vpxorq zmm0,zmm0,zmm24 + vpxorq zmm3,zmm3,zmm25 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_185 +$L$_small_initial_partial_block_185: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm17,xmm1,0x01 + vpclmulqdq xmm5,xmm17,xmm1,0x10 + vpclmulqdq xmm0,xmm17,xmm1,0x11 + vpclmulqdq xmm3,xmm17,xmm1,0x00 + vpxorq zmm4,zmm4,zmm26 + vpxorq zmm0,zmm0,zmm24 + vpxorq zmm3,zmm3,zmm25 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_185: + + or r13,r13 + je NEAR $L$_after_reduction_185 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_185: + jmp NEAR $L$_last_blocks_done_181 +$L$_last_num_blocks_is_3_181: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + kmovq k1,[rax*8+r10] + cmp r15d,253 + jae NEAR $L$_16_blocks_overflow_186 + vpaddd 
zmm0,zmm2,zmm28 + jmp NEAR $L$_16_blocks_ok_186 + +$L$_16_blocks_overflow_186: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpshufb zmm0,zmm0,zmm29 +$L$_16_blocks_ok_186: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1024+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm0,2 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1088+rsp] + vpxorq zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[1152+rsp] + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1216+rsp] + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17{k1}{z},[r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm24,zmm14,zmm12,0x96 + vpternlogq zmm25,zmm7,zmm13,0x96 + vpternlogq zmm26,zmm10,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vpxorq zmm0,zmm0,zmm17 + vextracti32x4 xmm11,zmm0,2 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10]{k1},zmm0 + vmovdqu8 zmm0{k1}{z},zmm0 + vpshufb zmm17,zmm0,zmm29 + vextracti32x4 xmm7,zmm17,2 + sub r13,16 * (3 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_187 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpxorq zmm4,zmm4,zmm26 + vpxorq zmm0,zmm0,zmm24 + vpxorq zmm3,zmm3,zmm25 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + 
jmp NEAR $L$_small_initial_compute_done_187 +$L$_small_initial_partial_block_187: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm17,ymm1,0x01 + vpclmulqdq ymm5,ymm17,ymm1,0x10 + vpclmulqdq ymm0,ymm17,ymm1,0x11 + vpclmulqdq ymm3,ymm17,ymm1,0x00 + vpxorq zmm4,zmm4,zmm26 + vpxorq zmm0,zmm0,zmm24 + vpxorq zmm3,zmm3,zmm25 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_187: + + or r13,r13 + je NEAR $L$_after_reduction_187 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_187: + jmp NEAR $L$_last_blocks_done_181 +$L$_last_num_blocks_is_4_181: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + kmovq k1,[rax*8+r10] + cmp r15d,252 + jae NEAR $L$_16_blocks_overflow_188 + vpaddd zmm0,zmm2,zmm28 + jmp NEAR $L$_16_blocks_ok_188 + +$L$_16_blocks_overflow_188: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpshufb zmm0,zmm0,zmm29 +$L$_16_blocks_ok_188: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1024+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm0,3 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1088+rsp] + vpxorq zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[1152+rsp] + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1216+rsp] + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17{k1}{z},[r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm24,zmm14,zmm12,0x96 + vpternlogq zmm25,zmm7,zmm13,0x96 + vpternlogq zmm26,zmm10,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + 
vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vpxorq zmm0,zmm0,zmm17 + vextracti32x4 xmm11,zmm0,3 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10]{k1},zmm0 + vmovdqu8 zmm0{k1}{z},zmm0 + vpshufb zmm17,zmm0,zmm29 + vextracti32x4 xmm7,zmm17,3 + sub r13,16 * (4 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_189 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + + vpxorq zmm30,zmm30,zmm26 + vpxorq zmm8,zmm8,zmm24 + vpxorq zmm22,zmm22,zmm25 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_189 +$L$_small_initial_partial_block_189: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpxorq zmm4,zmm4,zmm26 + vpxorq zmm0,zmm0,zmm24 + vpxorq zmm3,zmm3,zmm25 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_189: + + or r13,r13 + je NEAR $L$_after_reduction_189 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_189: + jmp NEAR $L$_last_blocks_done_181 +$L$_last_num_blocks_is_5_181: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,64 + kmovq k1,[rax*8+r10] + cmp r15d,251 + jae NEAR $L$_16_blocks_overflow_190 + vpaddd zmm0,zmm2,zmm28 + vpaddd xmm3,xmm0,xmm27 + jmp NEAR $L$_16_blocks_ok_190 + +$L$_16_blocks_overflow_190: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb xmm3,xmm3,xmm29 +$L$_16_blocks_ok_190: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1024+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm3,0 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1088+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + 
vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[1152+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1216+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 xmm19{k1}{z},[64+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm24,zmm14,zmm12,0x96 + vpternlogq zmm25,zmm7,zmm13,0x96 + vpternlogq zmm26,zmm10,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast xmm3,xmm3,xmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq xmm3,xmm3,xmm19 + vextracti32x4 xmm11,zmm3,0 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 XMMWORD[64+r11*1+r10]{k1},xmm3 + vmovdqu8 zmm3{k1}{z},zmm3 + vpshufb zmm17,zmm0,zmm29 + vpshufb xmm19,xmm3,xmm29 + vextracti32x4 xmm7,zmm19,0 + sub r13,16 * (5 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_191 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm19,xmm1,0x01 + vpclmulqdq xmm5,xmm19,xmm1,0x10 + vpclmulqdq xmm0,xmm19,xmm1,0x11 + vpclmulqdq xmm3,xmm19,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_191 
+$L$_small_initial_partial_block_191: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + + vpxorq zmm30,zmm30,zmm26 + vpxorq zmm8,zmm8,zmm24 + vpxorq zmm22,zmm22,zmm25 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_191: + + or r13,r13 + je NEAR $L$_after_reduction_191 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_191: + jmp NEAR $L$_last_blocks_done_181 +$L$_last_num_blocks_is_6_181: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,64 + kmovq k1,[rax*8+r10] + cmp r15d,250 + jae NEAR $L$_16_blocks_overflow_192 + vpaddd zmm0,zmm2,zmm28 + vpaddd ymm3,ymm0,ymm27 + jmp NEAR $L$_16_blocks_ok_192 + +$L$_16_blocks_overflow_192: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb ymm3,ymm3,ymm29 +$L$_16_blocks_ok_192: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1024+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm3,1 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1088+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[1152+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1216+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 ymm19{k1}{z},[64+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 
+ vpternlogq zmm24,zmm14,zmm12,0x96 + vpternlogq zmm25,zmm7,zmm13,0x96 + vpternlogq zmm26,zmm10,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast ymm3,ymm3,ymm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm11,zmm3,1 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 YMMWORD[64+r11*1+r10]{k1},ymm3 + vmovdqu8 zmm3{k1}{z},zmm3 + vpshufb zmm17,zmm0,zmm29 + vpshufb ymm19,ymm3,ymm29 + vextracti32x4 xmm7,zmm19,1 + sub r13,16 * (6 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_193 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm19,ymm1,0x01 + vpclmulqdq ymm5,ymm19,ymm1,0x10 + vpclmulqdq ymm0,ymm19,ymm1,0x11 + vpclmulqdq ymm3,ymm19,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_193 +$L$_small_initial_partial_block_193: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm19,xmm1,0x01 + vpclmulqdq xmm5,xmm19,xmm1,0x10 + vpclmulqdq xmm0,xmm19,xmm1,0x11 + vpclmulqdq xmm3,xmm19,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_193: + + or r13,r13 + je NEAR $L$_after_reduction_193 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_193: + jmp NEAR $L$_last_blocks_done_181 +$L$_last_num_blocks_is_7_181: + lea 
r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,64 + kmovq k1,[rax*8+r10] + cmp r15d,249 + jae NEAR $L$_16_blocks_overflow_194 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + jmp NEAR $L$_16_blocks_ok_194 + +$L$_16_blocks_overflow_194: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 +$L$_16_blocks_ok_194: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1024+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm3,2 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1088+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[1152+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1216+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19{k1}{z},[64+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm24,zmm14,zmm12,0x96 + vpternlogq zmm25,zmm7,zmm13,0x96 + vpternlogq zmm26,zmm10,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti32x4 xmm11,zmm3,2 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10]{k1},zmm3 + vmovdqu8 zmm3{k1}{z},zmm3 + vpshufb zmm17,zmm0,zmm29 + vpshufb zmm19,zmm3,zmm29 + vextracti32x4 xmm7,zmm19,2 + sub r13,16 * (7 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_195 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq 
zmm31,zmm17,zmm1,0x10 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm19,zmm1,0x01 + vpclmulqdq zmm5,zmm19,zmm1,0x10 + vpclmulqdq zmm0,zmm19,zmm1,0x11 + vpclmulqdq zmm3,zmm19,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_195 +$L$_small_initial_partial_block_195: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm19,ymm1,0x01 + vpclmulqdq ymm5,ymm19,ymm1,0x10 + vpclmulqdq ymm0,ymm19,ymm1,0x11 + vpclmulqdq ymm3,ymm19,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_195: + + or r13,r13 + je NEAR $L$_after_reduction_195 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_195: + jmp NEAR $L$_last_blocks_done_181 +$L$_last_num_blocks_is_8_181: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,64 + kmovq k1,[rax*8+r10] + cmp r15d,248 + jae NEAR $L$_16_blocks_overflow_196 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + jmp NEAR $L$_16_blocks_ok_196 + +$L$_16_blocks_overflow_196: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 +$L$_16_blocks_ok_196: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1024+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm3,3 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1088+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[1152+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 
zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1216+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19{k1}{z},[64+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm24,zmm14,zmm12,0x96 + vpternlogq zmm25,zmm7,zmm13,0x96 + vpternlogq zmm26,zmm10,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti32x4 xmm11,zmm3,3 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10]{k1},zmm3 + vmovdqu8 zmm3{k1}{z},zmm3 + vpshufb zmm17,zmm0,zmm29 + vpshufb zmm19,zmm3,zmm29 + vextracti32x4 xmm7,zmm19,3 + sub r13,16 * (8 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_197 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[224+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + + vpxorq zmm30,zmm30,zmm26 + vpxorq zmm8,zmm8,zmm24 + vpxorq zmm22,zmm22,zmm25 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_197 +$L$_small_initial_partial_block_197: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + 
vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm19,zmm1,0x01 + vpclmulqdq zmm5,zmm19,zmm1,0x10 + vpclmulqdq zmm0,zmm19,zmm1,0x11 + vpclmulqdq zmm3,zmm19,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_197: + + or r13,r13 + je NEAR $L$_after_reduction_197 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_197: + jmp NEAR $L$_last_blocks_done_181 +$L$_last_num_blocks_is_9_181: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,128 + kmovq k1,[rax*8+r10] + cmp r15d,247 + jae NEAR $L$_16_blocks_overflow_198 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd xmm4,xmm3,xmm27 + jmp NEAR $L$_16_blocks_ok_198 + +$L$_16_blocks_overflow_198: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb xmm4,xmm4,xmm29 +$L$_16_blocks_ok_198: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1024+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm4,0 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1088+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[1152+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1216+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 
xmm20{k1}{z},[128+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm24,zmm14,zmm12,0x96 + vpternlogq zmm25,zmm7,zmm13,0x96 + vpternlogq zmm26,zmm10,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast xmm4,xmm4,xmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq xmm4,xmm4,xmm20 + vextracti32x4 xmm11,zmm4,0 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 XMMWORD[128+r11*1+r10]{k1},xmm4 + vmovdqu8 zmm4{k1}{z},zmm4 + vpshufb zmm17,zmm0,zmm29 + vpshufb zmm19,zmm3,zmm29 + vpshufb xmm20,xmm4,xmm29 + vextracti32x4 xmm7,zmm20,0 + sub r13,16 * (9 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_199 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[208+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm20,xmm1,0x01 + vpclmulqdq xmm5,xmm20,xmm1,0x10 + vpclmulqdq xmm0,xmm20,xmm1,0x11 + vpclmulqdq xmm3,xmm20,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_199 +$L$_small_initial_partial_block_199: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[224+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq 
zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + + vpxorq zmm30,zmm30,zmm26 + vpxorq zmm8,zmm8,zmm24 + vpxorq zmm22,zmm22,zmm25 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_199: + + or r13,r13 + je NEAR $L$_after_reduction_199 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_199: + jmp NEAR $L$_last_blocks_done_181 +$L$_last_num_blocks_is_10_181: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,128 + kmovq k1,[rax*8+r10] + cmp r15d,246 + jae NEAR $L$_16_blocks_overflow_200 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd ymm4,ymm3,ymm27 + jmp NEAR $L$_16_blocks_ok_200 + +$L$_16_blocks_overflow_200: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb ymm4,ymm4,ymm29 +$L$_16_blocks_ok_200: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1024+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm4,1 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1088+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[1152+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1216+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 ymm20{k1}{z},[128+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc 
zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm24,zmm14,zmm12,0x96 + vpternlogq zmm25,zmm7,zmm13,0x96 + vpternlogq zmm26,zmm10,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast ymm4,ymm4,ymm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq ymm4,ymm4,ymm20 + vextracti32x4 xmm11,zmm4,1 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 YMMWORD[128+r11*1+r10]{k1},ymm4 + vmovdqu8 zmm4{k1}{z},zmm4 + vpshufb zmm17,zmm0,zmm29 + vpshufb zmm19,zmm3,zmm29 + vpshufb ymm20,ymm4,ymm29 + vextracti32x4 xmm7,zmm20,1 + sub r13,16 * (10 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_201 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[192+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm20,ymm1,0x01 + vpclmulqdq ymm5,ymm20,ymm1,0x10 + vpclmulqdq ymm0,ymm20,ymm1,0x11 + vpclmulqdq ymm3,ymm20,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_201 +$L$_small_initial_partial_block_201: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[208+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm20,xmm1,0x01 + vpclmulqdq xmm5,xmm20,xmm1,0x10 + vpclmulqdq xmm0,xmm20,xmm1,0x11 + vpclmulqdq xmm3,xmm20,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 
+ vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_201: + + or r13,r13 + je NEAR $L$_after_reduction_201 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_201: + jmp NEAR $L$_last_blocks_done_181 +$L$_last_num_blocks_is_11_181: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,128 + kmovq k1,[rax*8+r10] + cmp r15d,245 + jae NEAR $L$_16_blocks_overflow_202 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + jmp NEAR $L$_16_blocks_ok_202 + +$L$_16_blocks_overflow_202: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 +$L$_16_blocks_ok_202: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1024+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm4,2 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1088+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[1152+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1216+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20{k1}{z},[128+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] 
+ vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm24,zmm14,zmm12,0x96 + vpternlogq zmm25,zmm7,zmm13,0x96 + vpternlogq zmm26,zmm10,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vextracti32x4 xmm11,zmm4,2 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10]{k1},zmm4 + vmovdqu8 zmm4{k1}{z},zmm4 + vpshufb zmm17,zmm0,zmm29 + vpshufb zmm19,zmm3,zmm29 + vpshufb zmm20,zmm4,zmm29 + vextracti32x4 xmm7,zmm20,2 + sub r13,16 * (11 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_203 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[176+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm20,zmm1,0x01 + vpclmulqdq zmm5,zmm20,zmm1,0x10 + vpclmulqdq zmm0,zmm20,zmm1,0x11 + vpclmulqdq zmm3,zmm20,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_203 +$L$_small_initial_partial_block_203: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[192+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm20,ymm1,0x01 + vpclmulqdq ymm5,ymm20,ymm1,0x10 + vpclmulqdq ymm0,ymm20,ymm1,0x11 + vpclmulqdq ymm3,ymm20,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq 
zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_203: + + or r13,r13 + je NEAR $L$_after_reduction_203 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_203: + jmp NEAR $L$_last_blocks_done_181 +$L$_last_num_blocks_is_12_181: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,128 + kmovq k1,[rax*8+r10] + cmp r15d,244 + jae NEAR $L$_16_blocks_overflow_204 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + jmp NEAR $L$_16_blocks_ok_204 + +$L$_16_blocks_overflow_204: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 +$L$_16_blocks_ok_204: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1024+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm4,3 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1088+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[1152+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1216+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20{k1}{z},[128+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + 
vpternlogq zmm24,zmm14,zmm12,0x96 + vpternlogq zmm25,zmm7,zmm13,0x96 + vpternlogq zmm26,zmm10,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vextracti32x4 xmm11,zmm4,3 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10]{k1},zmm4 + vmovdqu8 zmm4{k1}{z},zmm4 + vpshufb zmm17,zmm0,zmm29 + vpshufb zmm19,zmm3,zmm29 + vpshufb zmm20,zmm4,zmm29 + vextracti32x4 xmm7,zmm20,3 + sub r13,16 * (12 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_205 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[160+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[224+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + + vpxorq zmm30,zmm30,zmm26 + vpxorq zmm8,zmm8,zmm24 + vpxorq zmm22,zmm22,zmm25 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_205 +$L$_small_initial_partial_block_205: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[176+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm20,zmm1,0x01 + vpclmulqdq zmm5,zmm20,zmm1,0x10 + vpclmulqdq zmm0,zmm20,zmm1,0x11 + vpclmulqdq zmm3,zmm20,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + 
vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_205: + + or r13,r13 + je NEAR $L$_after_reduction_205 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_205: + jmp NEAR $L$_last_blocks_done_181 +$L$_last_num_blocks_is_13_181: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,192 + kmovq k1,[rax*8+r10] + cmp r15d,243 + jae NEAR $L$_16_blocks_overflow_206 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + vpaddd xmm5,xmm4,xmm27 + jmp NEAR $L$_16_blocks_ok_206 + +$L$_16_blocks_overflow_206: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpaddd zmm5,zmm4,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb xmm5,xmm5,xmm29 +$L$_16_blocks_ok_206: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1024+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm5,0 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1088+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vpxorq xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[1152+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1216+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20,ZMMWORD[128+r11*1+r9] + vmovdqu8 xmm21{k1}{z},[192+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq 
zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm24,zmm14,zmm12,0x96 + vpternlogq zmm25,zmm7,zmm13,0x96 + vpternlogq zmm26,zmm10,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vaesenclast xmm5,xmm5,xmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vpxorq xmm5,xmm5,xmm21 + vextracti32x4 xmm11,zmm5,0 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm4 + vmovdqu8 XMMWORD[192+r11*1+r10]{k1},xmm5 + vmovdqu8 zmm5{k1}{z},zmm5 + vpshufb zmm17,zmm0,zmm29 + vpshufb zmm19,zmm3,zmm29 + vpshufb zmm20,zmm4,zmm29 + vpshufb xmm21,xmm5,xmm29 + vextracti32x4 xmm7,zmm21,0 + sub r13,16 * (13 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_207 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[144+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[208+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm21,xmm1,0x01 + vpclmulqdq xmm5,xmm21,xmm1,0x10 + vpclmulqdq xmm0,xmm21,xmm1,0x11 + vpclmulqdq xmm3,xmm21,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_207 +$L$_small_initial_partial_block_207: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[160+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq 
zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[224+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + + vpxorq zmm30,zmm30,zmm26 + vpxorq zmm8,zmm8,zmm24 + vpxorq zmm22,zmm22,zmm25 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_207: + + or r13,r13 + je NEAR $L$_after_reduction_207 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_207: + jmp NEAR $L$_last_blocks_done_181 +$L$_last_num_blocks_is_14_181: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,192 + kmovq k1,[rax*8+r10] + cmp r15d,242 + jae NEAR $L$_16_blocks_overflow_208 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + vpaddd ymm5,ymm4,ymm27 + jmp NEAR $L$_16_blocks_ok_208 + +$L$_16_blocks_overflow_208: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpaddd zmm5,zmm4,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb ymm5,ymm5,ymm29 +$L$_16_blocks_ok_208: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1024+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm5,1 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1088+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vpxorq ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[1152+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1216+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq 
zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20,ZMMWORD[128+r11*1+r9] + vmovdqu8 ymm21{k1}{z},[192+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm24,zmm14,zmm12,0x96 + vpternlogq zmm25,zmm7,zmm13,0x96 + vpternlogq zmm26,zmm10,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vaesenclast ymm5,ymm5,ymm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vpxorq ymm5,ymm5,ymm21 + vextracti32x4 xmm11,zmm5,1 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm4 + vmovdqu8 YMMWORD[192+r11*1+r10]{k1},ymm5 + vmovdqu8 zmm5{k1}{z},zmm5 + vpshufb zmm17,zmm0,zmm29 + vpshufb zmm19,zmm3,zmm29 + vpshufb zmm20,zmm4,zmm29 + vpshufb ymm21,ymm5,ymm29 + vextracti32x4 xmm7,zmm21,1 + sub r13,16 * (14 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_209 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[128+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[192+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm21,ymm1,0x01 + vpclmulqdq ymm5,ymm21,ymm1,0x10 + vpclmulqdq ymm0,ymm21,ymm1,0x11 + vpclmulqdq ymm3,ymm21,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + 
vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_209 +$L$_small_initial_partial_block_209: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[144+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[208+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm21,xmm1,0x01 + vpclmulqdq xmm5,xmm21,xmm1,0x10 + vpclmulqdq xmm0,xmm21,xmm1,0x11 + vpclmulqdq xmm3,xmm21,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_209: + + or r13,r13 + je NEAR $L$_after_reduction_209 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_209: + jmp NEAR $L$_last_blocks_done_181 +$L$_last_num_blocks_is_15_181: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,192 + kmovq k1,[rax*8+r10] + cmp r15d,241 + jae NEAR $L$_16_blocks_overflow_210 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + vpaddd zmm5,zmm4,zmm27 + jmp NEAR $L$_16_blocks_ok_210 + +$L$_16_blocks_overflow_210: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpaddd zmm5,zmm4,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb zmm5,zmm5,zmm29 +$L$_16_blocks_ok_210: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1024+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm5,2 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1088+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 
zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[1152+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1216+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20,ZMMWORD[128+r11*1+r9] + vmovdqu8 zmm21{k1}{z},[192+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm24,zmm14,zmm12,0x96 + vpternlogq zmm25,zmm7,zmm13,0x96 + vpternlogq zmm26,zmm10,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vaesenclast zmm5,zmm5,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vpxorq zmm5,zmm5,zmm21 + vextracti32x4 xmm11,zmm5,2 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm4 + vmovdqu8 ZMMWORD[192+r11*1+r10]{k1},zmm5 + vmovdqu8 zmm5{k1}{z},zmm5 + vpshufb zmm17,zmm0,zmm29 + vpshufb zmm19,zmm3,zmm29 + vpshufb zmm20,zmm4,zmm29 + vpshufb zmm21,zmm5,zmm29 + vextracti32x4 xmm7,zmm21,2 + sub r13,16 * (15 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_211 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[112+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[176+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq 
zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm21,zmm1,0x01 + vpclmulqdq zmm5,zmm21,zmm1,0x10 + vpclmulqdq zmm0,zmm21,zmm1,0x11 + vpclmulqdq zmm3,zmm21,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_211 +$L$_small_initial_partial_block_211: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[128+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[192+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm21,ymm1,0x01 + vpclmulqdq ymm5,ymm21,ymm1,0x10 + vpclmulqdq ymm0,ymm21,ymm1,0x11 + vpclmulqdq ymm3,ymm21,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_211: + + or r13,r13 + je NEAR $L$_after_reduction_211 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_211: + jmp NEAR $L$_last_blocks_done_181 +$L$_last_num_blocks_is_16_181: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,192 + kmovq k1,[rax*8+r10] + cmp r15d,240 + jae NEAR $L$_16_blocks_overflow_212 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + vpaddd zmm5,zmm4,zmm27 + jmp NEAR $L$_16_blocks_ok_212 + 
+$L$_16_blocks_overflow_212: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpaddd zmm5,zmm4,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb zmm5,zmm5,zmm29 +$L$_16_blocks_ok_212: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1024+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm5,3 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1088+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[1152+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1216+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20,ZMMWORD[128+r11*1+r9] + vmovdqu8 zmm21{k1}{z},[192+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm24,zmm14,zmm12,0x96 + vpternlogq zmm25,zmm7,zmm13,0x96 + vpternlogq zmm26,zmm10,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + 
vaesenclast zmm4,zmm4,zmm30 + vaesenclast zmm5,zmm5,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vpxorq zmm5,zmm5,zmm21 + vextracti32x4 xmm11,zmm5,3 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm4 + vmovdqu8 ZMMWORD[192+r11*1+r10]{k1},zmm5 + vmovdqu8 zmm5{k1}{z},zmm5 + vpshufb zmm17,zmm0,zmm29 + vpshufb zmm19,zmm3,zmm29 + vpshufb zmm20,zmm4,zmm29 + vpshufb zmm21,zmm5,zmm29 + vextracti32x4 xmm7,zmm21,3 + sub r13,16 * (16 - 1) +$L$_small_initial_partial_block_213: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[112+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[176+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm21,zmm1,0x01 + vpclmulqdq zmm5,zmm21,zmm1,0x10 + vpclmulqdq zmm0,zmm21,zmm1,0x11 + vpclmulqdq zmm3,zmm21,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_213: + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_213: + jmp NEAR $L$_last_blocks_done_181 +$L$_last_num_blocks_is_0_181: + vmovdqa64 zmm13,ZMMWORD[1024+rsp] + vmovdqu64 zmm12,ZMMWORD[rbx*1+rsp] + vpclmulqdq zmm0,zmm13,zmm12,0x11 + vpclmulqdq zmm3,zmm13,zmm12,0x00 + vpclmulqdq zmm4,zmm13,zmm12,0x01 + vpclmulqdq zmm5,zmm13,zmm12,0x10 + vmovdqa64 zmm13,ZMMWORD[1088+rsp] + vmovdqu64 zmm12,ZMMWORD[64+rbx*1+rsp] + vpclmulqdq zmm6,zmm13,zmm12,0x11 + vpclmulqdq zmm7,zmm13,zmm12,0x00 + vpclmulqdq zmm10,zmm13,zmm12,0x01 + vpclmulqdq zmm11,zmm13,zmm12,0x10 + vpternlogq zmm26,zmm4,zmm10,0x96 + vpternlogq zmm24,zmm0,zmm6,0x96 + vpternlogq zmm25,zmm3,zmm7,0x96 + vpternlogq zmm26,zmm5,zmm11,0x96 + vmovdqa64 zmm13,ZMMWORD[1152+rsp] + vmovdqu64 zmm12,ZMMWORD[128+rbx*1+rsp] + vpclmulqdq zmm0,zmm13,zmm12,0x11 + vpclmulqdq zmm3,zmm13,zmm12,0x00 + vpclmulqdq zmm4,zmm13,zmm12,0x01 + vpclmulqdq zmm5,zmm13,zmm12,0x10 + vmovdqa64 zmm13,ZMMWORD[1216+rsp] + vmovdqu64 zmm12,ZMMWORD[192+rbx*1+rsp] + vpclmulqdq zmm6,zmm13,zmm12,0x11 + vpclmulqdq zmm7,zmm13,zmm12,0x00 + vpclmulqdq zmm10,zmm13,zmm12,0x01 + vpclmulqdq zmm11,zmm13,zmm12,0x10 + + vpternlogq zmm26,zmm4,zmm10,0x96 + vpternlogq zmm24,zmm0,zmm6,0x96 + vpternlogq zmm25,zmm3,zmm7,0x96 + vpternlogq 
zmm26,zmm5,zmm11,0x96 + + vpsrldq zmm0,zmm26,8 + vpslldq zmm3,zmm26,8 + vpxorq zmm24,zmm24,zmm0 + vpxorq zmm25,zmm25,zmm3 + vextracti64x4 ymm0,zmm24,1 + vpxorq ymm24,ymm24,ymm0 + vextracti32x4 xmm0,ymm24,1 + vpxorq xmm24,xmm24,xmm0 + vextracti64x4 ymm3,zmm25,1 + vpxorq ymm25,ymm25,ymm3 + vextracti32x4 xmm3,ymm25,1 + vpxorq xmm25,xmm25,xmm3 + vmovdqa64 xmm4,XMMWORD[POLY2] + + + vpclmulqdq xmm0,xmm4,xmm25,0x01 + vpslldq xmm0,xmm0,8 + vpxorq xmm0,xmm25,xmm0 + + + vpclmulqdq xmm3,xmm4,xmm0,0x00 + vpsrldq xmm3,xmm3,4 + vpclmulqdq xmm14,xmm4,xmm0,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm3,xmm24,0x96 + +$L$_last_blocks_done_181: + vpshufb xmm2,xmm2,xmm29 + jmp NEAR $L$_ghash_done_172 +$L$_encrypt_32_blocks_172: + cmp r15b,240 + jae NEAR $L$_16_blocks_overflow_214 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + vpaddd zmm5,zmm4,zmm27 + jmp NEAR $L$_16_blocks_ok_214 +$L$_16_blocks_overflow_214: + vpshufb zmm2,zmm2,zmm29 + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpaddd zmm5,zmm4,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb zmm5,zmm5,zmm29 +$L$_16_blocks_ok_214: + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rsp] + + + + + vshufi64x2 zmm2,zmm5,zmm5,255 + add r15b,16 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + + + + + + + + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + + vpclmulqdq zmm6,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + + + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + + + + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + + vpternlogq zmm6,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + + + + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + + + + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20,ZMMWORD[128+r11*1+r9] + vmovdqu8 zmm21,ZMMWORD[192+r11*1+r9] + + + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + + + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + 
vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm26,zmm10,zmm15 + vpxorq zmm24,zmm6,zmm12 + vpxorq zmm25,zmm7,zmm13 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vaesenclast zmm5,zmm5,zmm30 + + + + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vpxorq zmm5,zmm5,zmm21 + + + + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm4 + vmovdqu8 ZMMWORD[192+r11*1+r10],zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb zmm5,zmm5,zmm29 + vmovdqa64 ZMMWORD[1280+rsp],zmm0 + vmovdqa64 ZMMWORD[1344+rsp],zmm3 + vmovdqa64 ZMMWORD[1408+rsp],zmm4 + vmovdqa64 ZMMWORD[1472+rsp],zmm5 + cmp r15b,240 + jae NEAR $L$_16_blocks_overflow_215 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + vpaddd zmm5,zmm4,zmm27 + jmp NEAR $L$_16_blocks_ok_215 +$L$_16_blocks_overflow_215: + vpshufb zmm2,zmm2,zmm29 + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpaddd zmm5,zmm4,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb zmm5,zmm5,zmm29 +$L$_16_blocks_ok_215: + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1024+rsp] + vmovdqu64 zmm1,ZMMWORD[256+rsp] + + + + + vshufi64x2 zmm2,zmm5,zmm5,255 + add r15b,16 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[320+rsp] + vmovdqa64 zmm22,ZMMWORD[1088+rsp] + + + + + + + + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + + vpclmulqdq zmm6,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[384+rsp] + vmovdqa64 zmm8,ZMMWORD[1152+rsp] + + + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[448+rsp] + vmovdqa64 zmm22,ZMMWORD[1216+rsp] + + + + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + + vpternlogq zmm6,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq 
zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + + + + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + + + + vmovdqu8 zmm17,ZMMWORD[256+r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[320+r11*1+r9] + vmovdqu8 zmm20,ZMMWORD[384+r11*1+r9] + vmovdqu8 zmm21,ZMMWORD[448+r11*1+r9] + + + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + + + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm26,zmm10,zmm15,0x96 + vpternlogq zmm24,zmm6,zmm12,0x96 + vpternlogq zmm25,zmm7,zmm13,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vaesenclast zmm5,zmm5,zmm30 + + + + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vpxorq zmm5,zmm5,zmm21 + + + + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[256+r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[320+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[384+r11*1+r10],zmm4 + vmovdqu8 ZMMWORD[448+r11*1+r10],zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb zmm5,zmm5,zmm29 + vmovdqa64 ZMMWORD[768+rsp],zmm0 + vmovdqa64 ZMMWORD[832+rsp],zmm3 + vmovdqa64 ZMMWORD[896+rsp],zmm4 + vmovdqa64 ZMMWORD[960+rsp],zmm5 + vmovdqa64 zmm13,ZMMWORD[1280+rsp] + vmovdqu64 zmm12,ZMMWORD[512+rsp] + vpclmulqdq zmm0,zmm13,zmm12,0x11 + vpclmulqdq zmm3,zmm13,zmm12,0x00 + vpclmulqdq zmm4,zmm13,zmm12,0x01 + vpclmulqdq zmm5,zmm13,zmm12,0x10 + vmovdqa64 zmm13,ZMMWORD[1344+rsp] + vmovdqu64 zmm12,ZMMWORD[576+rsp] + vpclmulqdq zmm6,zmm13,zmm12,0x11 + vpclmulqdq zmm7,zmm13,zmm12,0x00 + vpclmulqdq zmm10,zmm13,zmm12,0x01 + vpclmulqdq zmm11,zmm13,zmm12,0x10 + vpternlogq zmm26,zmm4,zmm10,0x96 + vpternlogq zmm24,zmm0,zmm6,0x96 + vpternlogq zmm25,zmm3,zmm7,0x96 + vpternlogq zmm26,zmm5,zmm11,0x96 + vmovdqa64 zmm13,ZMMWORD[1408+rsp] + vmovdqu64 zmm12,ZMMWORD[640+rsp] + vpclmulqdq zmm0,zmm13,zmm12,0x11 + vpclmulqdq zmm3,zmm13,zmm12,0x00 + vpclmulqdq zmm4,zmm13,zmm12,0x01 + vpclmulqdq zmm5,zmm13,zmm12,0x10 + vmovdqa64 zmm13,ZMMWORD[1472+rsp] + vmovdqu64 zmm12,ZMMWORD[704+rsp] + vpclmulqdq zmm6,zmm13,zmm12,0x11 + vpclmulqdq zmm7,zmm13,zmm12,0x00 + vpclmulqdq zmm10,zmm13,zmm12,0x01 + vpclmulqdq zmm11,zmm13,zmm12,0x10 + + vpternlogq zmm26,zmm4,zmm10,0x96 + vpternlogq zmm24,zmm0,zmm6,0x96 + vpternlogq zmm25,zmm3,zmm7,0x96 + vpternlogq zmm26,zmm5,zmm11,0x96 + + vpsrldq zmm0,zmm26,8 + vpslldq zmm3,zmm26,8 + vpxorq zmm24,zmm24,zmm0 + vpxorq zmm25,zmm25,zmm3 + vextracti64x4 
ymm0,zmm24,1 + vpxorq ymm24,ymm24,ymm0 + vextracti32x4 xmm0,ymm24,1 + vpxorq xmm24,xmm24,xmm0 + vextracti64x4 ymm3,zmm25,1 + vpxorq ymm25,ymm25,ymm3 + vextracti32x4 xmm3,ymm25,1 + vpxorq xmm25,xmm25,xmm3 + vmovdqa64 xmm4,XMMWORD[POLY2] + + + vpclmulqdq xmm0,xmm4,xmm25,0x01 + vpslldq xmm0,xmm0,8 + vpxorq xmm0,xmm25,xmm0 + + + vpclmulqdq xmm3,xmm4,xmm0,0x00 + vpsrldq xmm3,xmm3,4 + vpclmulqdq xmm14,xmm4,xmm0,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm3,xmm24,0x96 + + sub r13,512 + add r11,512 + mov r10d,r13d + and r10d,~15 + mov ebx,512 + sub ebx,r10d + mov r10d,r13d + add r10d,15 + shr r10d,4 + je NEAR $L$_last_num_blocks_is_0_216 + + cmp r10d,8 + je NEAR $L$_last_num_blocks_is_8_216 + jb NEAR $L$_last_num_blocks_is_7_1_216 + + + cmp r10d,12 + je NEAR $L$_last_num_blocks_is_12_216 + jb NEAR $L$_last_num_blocks_is_11_9_216 + + + cmp r10d,15 + je NEAR $L$_last_num_blocks_is_15_216 + ja NEAR $L$_last_num_blocks_is_16_216 + cmp r10d,14 + je NEAR $L$_last_num_blocks_is_14_216 + jmp NEAR $L$_last_num_blocks_is_13_216 + +$L$_last_num_blocks_is_11_9_216: + + cmp r10d,10 + je NEAR $L$_last_num_blocks_is_10_216 + ja NEAR $L$_last_num_blocks_is_11_216 + jmp NEAR $L$_last_num_blocks_is_9_216 + +$L$_last_num_blocks_is_7_1_216: + cmp r10d,4 + je NEAR $L$_last_num_blocks_is_4_216 + jb NEAR $L$_last_num_blocks_is_3_1_216 + + cmp r10d,6 + ja NEAR $L$_last_num_blocks_is_7_216 + je NEAR $L$_last_num_blocks_is_6_216 + jmp NEAR $L$_last_num_blocks_is_5_216 + +$L$_last_num_blocks_is_3_1_216: + + cmp r10d,2 + ja NEAR $L$_last_num_blocks_is_3_216 + je NEAR $L$_last_num_blocks_is_2_216 +$L$_last_num_blocks_is_1_216: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + kmovq k1,[rax*8+r10] + cmp r15d,255 + jae NEAR $L$_16_blocks_overflow_217 + vpaddd xmm0,xmm2,xmm28 + jmp NEAR $L$_16_blocks_ok_217 + +$L$_16_blocks_overflow_217: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpshufb xmm0,xmm0,xmm29 +$L$_16_blocks_ok_217: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm0,0 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc xmm0,xmm0,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc xmm0,xmm0,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 xmm17{k1}{z},[r11*1+r9] + vaesenc xmm0,xmm0,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 
+ vaesenc xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc xmm0,xmm0,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc xmm0,xmm0,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc xmm0,xmm0,xmm31 + vaesenclast xmm0,xmm0,xmm30 + vpxorq xmm0,xmm0,xmm17 + vextracti32x4 xmm11,zmm0,0 + mov r10,QWORD[120+rbp] + vmovdqu8 XMMWORD[r11*1+r10]{k1},xmm0 + vmovdqu8 zmm0{k1}{z},zmm0 + vpshufb xmm17,xmm0,xmm29 + vextracti32x4 xmm7,zmm17,0 + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_218 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm17,xmm1,0x01 + vpclmulqdq xmm5,xmm17,xmm1,0x10 + vpclmulqdq xmm0,xmm17,xmm1,0x11 + vpclmulqdq xmm3,xmm17,xmm1,0x00 + vpxorq zmm4,zmm4,zmm26 + vpxorq zmm0,zmm0,zmm24 + vpxorq zmm3,zmm3,zmm25 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_218 +$L$_small_initial_partial_block_218: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + + + vpsrldq zmm0,zmm26,8 + vpslldq zmm3,zmm26,8 + vpxorq zmm24,zmm24,zmm0 + vpxorq zmm25,zmm25,zmm3 + vextracti64x4 ymm0,zmm24,1 + vpxorq ymm24,ymm24,ymm0 + vextracti32x4 xmm0,ymm24,1 + vpxorq xmm24,xmm24,xmm0 + vextracti64x4 ymm3,zmm25,1 + vpxorq ymm25,ymm25,ymm3 + vextracti32x4 xmm3,ymm25,1 + vpxorq xmm25,xmm25,xmm3 + vmovdqa64 xmm0,XMMWORD[POLY2] + + + vpclmulqdq xmm3,xmm0,xmm25,0x01 + vpslldq xmm3,xmm3,8 + vpxorq xmm3,xmm25,xmm3 + + + vpclmulqdq xmm4,xmm0,xmm3,0x00 + vpsrldq xmm4,xmm4,4 + vpclmulqdq xmm14,xmm0,xmm3,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm4,xmm24,0x96 + + + + + + + + + + + + + vpxorq xmm14,xmm14,xmm7 + + jmp NEAR $L$_after_reduction_218 +$L$_small_initial_compute_done_218: +$L$_after_reduction_218: + jmp NEAR $L$_last_blocks_done_216 +$L$_last_num_blocks_is_2_216: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + kmovq k1,[rax*8+r10] + cmp r15d,254 + jae NEAR $L$_16_blocks_overflow_219 + vpaddd ymm0,ymm2,ymm28 + jmp NEAR $L$_16_blocks_ok_219 + +$L$_16_blocks_overflow_219: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpshufb ymm0,ymm0,ymm29 +$L$_16_blocks_ok_219: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm0,1 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc 
ymm0,ymm0,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc ymm0,ymm0,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 ymm17{k1}{z},[r11*1+r9] + vaesenc ymm0,ymm0,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc ymm0,ymm0,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc ymm0,ymm0,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc ymm0,ymm0,ymm31 + vaesenclast ymm0,ymm0,ymm30 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm11,zmm0,1 + mov r10,QWORD[120+rbp] + vmovdqu8 YMMWORD[r11*1+r10]{k1},ymm0 + vmovdqu8 zmm0{k1}{z},zmm0 + vpshufb ymm17,ymm0,ymm29 + vextracti32x4 xmm7,zmm17,1 + sub r13,16 * (2 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_220 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm17,ymm1,0x01 + vpclmulqdq ymm5,ymm17,ymm1,0x10 + vpclmulqdq ymm0,ymm17,ymm1,0x11 + vpclmulqdq ymm3,ymm17,ymm1,0x00 + vpxorq zmm4,zmm4,zmm26 + vpxorq zmm0,zmm0,zmm24 + vpxorq zmm3,zmm3,zmm25 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_220 +$L$_small_initial_partial_block_220: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm17,xmm1,0x01 + vpclmulqdq xmm5,xmm17,xmm1,0x10 + vpclmulqdq xmm0,xmm17,xmm1,0x11 + vpclmulqdq xmm3,xmm17,xmm1,0x00 + vpxorq zmm4,zmm4,zmm26 + vpxorq zmm0,zmm0,zmm24 + vpxorq zmm3,zmm3,zmm25 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 
+ + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_220: + + or r13,r13 + je NEAR $L$_after_reduction_220 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_220: + jmp NEAR $L$_last_blocks_done_216 +$L$_last_num_blocks_is_3_216: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + kmovq k1,[rax*8+r10] + cmp r15d,253 + jae NEAR $L$_16_blocks_overflow_221 + vpaddd zmm0,zmm2,zmm28 + jmp NEAR $L$_16_blocks_ok_221 + +$L$_16_blocks_overflow_221: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpshufb zmm0,zmm0,zmm29 +$L$_16_blocks_ok_221: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm0,2 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17{k1}{z},[r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vpxorq zmm0,zmm0,zmm17 + vextracti32x4 xmm11,zmm0,2 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10]{k1},zmm0 + vmovdqu8 zmm0{k1}{z},zmm0 + vpshufb zmm17,zmm0,zmm29 + vextracti32x4 xmm7,zmm17,2 + sub r13,16 * (3 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_222 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpxorq zmm4,zmm4,zmm26 + vpxorq zmm0,zmm0,zmm24 + vpxorq zmm3,zmm3,zmm25 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 
ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_222 +$L$_small_initial_partial_block_222: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm17,ymm1,0x01 + vpclmulqdq ymm5,ymm17,ymm1,0x10 + vpclmulqdq ymm0,ymm17,ymm1,0x11 + vpclmulqdq ymm3,ymm17,ymm1,0x00 + vpxorq zmm4,zmm4,zmm26 + vpxorq zmm0,zmm0,zmm24 + vpxorq zmm3,zmm3,zmm25 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_222: + + or r13,r13 + je NEAR $L$_after_reduction_222 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_222: + jmp NEAR $L$_last_blocks_done_216 +$L$_last_num_blocks_is_4_216: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + kmovq k1,[rax*8+r10] + cmp r15d,252 + jae NEAR $L$_16_blocks_overflow_223 + vpaddd zmm0,zmm2,zmm28 + jmp NEAR $L$_16_blocks_ok_223 + +$L$_16_blocks_overflow_223: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpshufb zmm0,zmm0,zmm29 +$L$_16_blocks_ok_223: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm0,3 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17{k1}{z},[r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq 
zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vpxorq zmm0,zmm0,zmm17 + vextracti32x4 xmm11,zmm0,3 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10]{k1},zmm0 + vmovdqu8 zmm0{k1}{z},zmm0 + vpshufb zmm17,zmm0,zmm29 + vextracti32x4 xmm7,zmm17,3 + sub r13,16 * (4 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_224 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + + vpxorq zmm30,zmm30,zmm26 + vpxorq zmm8,zmm8,zmm24 + vpxorq zmm22,zmm22,zmm25 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_224 +$L$_small_initial_partial_block_224: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpxorq zmm4,zmm4,zmm26 + vpxorq zmm0,zmm0,zmm24 + vpxorq zmm3,zmm3,zmm25 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_224: + + or r13,r13 + je NEAR $L$_after_reduction_224 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_224: + jmp NEAR $L$_last_blocks_done_216 +$L$_last_num_blocks_is_5_216: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,64 + kmovq k1,[rax*8+r10] + cmp r15d,251 + jae NEAR $L$_16_blocks_overflow_225 + vpaddd zmm0,zmm2,zmm28 + vpaddd xmm3,xmm0,xmm27 + jmp NEAR $L$_16_blocks_ok_225 + +$L$_16_blocks_overflow_225: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb xmm3,xmm3,xmm29 +$L$_16_blocks_ok_225: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + 
vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm3,0 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 xmm19{k1}{z},[64+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast xmm3,xmm3,xmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq xmm3,xmm3,xmm19 + vextracti32x4 xmm11,zmm3,0 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 XMMWORD[64+r11*1+r10]{k1},xmm3 + vmovdqu8 zmm3{k1}{z},zmm3 + vpshufb zmm17,zmm0,zmm29 + vpshufb xmm19,xmm3,xmm29 + vextracti32x4 xmm7,zmm19,0 + sub r13,16 * (5 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_226 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm19,xmm1,0x01 + vpclmulqdq xmm5,xmm19,xmm1,0x10 + vpclmulqdq xmm0,xmm19,xmm1,0x11 + vpclmulqdq xmm3,xmm19,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + 
vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_226 +$L$_small_initial_partial_block_226: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + + vpxorq zmm30,zmm30,zmm26 + vpxorq zmm8,zmm8,zmm24 + vpxorq zmm22,zmm22,zmm25 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_226: + + or r13,r13 + je NEAR $L$_after_reduction_226 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_226: + jmp NEAR $L$_last_blocks_done_216 +$L$_last_num_blocks_is_6_216: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,64 + kmovq k1,[rax*8+r10] + cmp r15d,250 + jae NEAR $L$_16_blocks_overflow_227 + vpaddd zmm0,zmm2,zmm28 + vpaddd ymm3,ymm0,ymm27 + jmp NEAR $L$_16_blocks_ok_227 + +$L$_16_blocks_overflow_227: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb ymm3,ymm3,ymm29 +$L$_16_blocks_ok_227: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm3,1 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 
ymm19{k1}{z},[64+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast ymm3,ymm3,ymm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm11,zmm3,1 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 YMMWORD[64+r11*1+r10]{k1},ymm3 + vmovdqu8 zmm3{k1}{z},zmm3 + vpshufb zmm17,zmm0,zmm29 + vpshufb ymm19,ymm3,ymm29 + vextracti32x4 xmm7,zmm19,1 + sub r13,16 * (6 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_228 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm19,ymm1,0x01 + vpclmulqdq ymm5,ymm19,ymm1,0x10 + vpclmulqdq ymm0,ymm19,ymm1,0x11 + vpclmulqdq ymm3,ymm19,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_228 +$L$_small_initial_partial_block_228: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm19,xmm1,0x01 + vpclmulqdq xmm5,xmm19,xmm1,0x10 + vpclmulqdq xmm0,xmm19,xmm1,0x11 + vpclmulqdq xmm3,xmm19,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + 
vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_228: + + or r13,r13 + je NEAR $L$_after_reduction_228 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_228: + jmp NEAR $L$_last_blocks_done_216 +$L$_last_num_blocks_is_7_216: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,64 + kmovq k1,[rax*8+r10] + cmp r15d,249 + jae NEAR $L$_16_blocks_overflow_229 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + jmp NEAR $L$_16_blocks_ok_229 + +$L$_16_blocks_overflow_229: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 +$L$_16_blocks_ok_229: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm3,2 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19{k1}{z},[64+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti32x4 xmm11,zmm3,2 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10]{k1},zmm3 + vmovdqu8 zmm3{k1}{z},zmm3 + vpshufb zmm17,zmm0,zmm29 + vpshufb 
zmm19,zmm3,zmm29 + vextracti32x4 xmm7,zmm19,2 + sub r13,16 * (7 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_230 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm19,zmm1,0x01 + vpclmulqdq zmm5,zmm19,zmm1,0x10 + vpclmulqdq zmm0,zmm19,zmm1,0x11 + vpclmulqdq zmm3,zmm19,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_230 +$L$_small_initial_partial_block_230: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm19,ymm1,0x01 + vpclmulqdq ymm5,ymm19,ymm1,0x10 + vpclmulqdq ymm0,ymm19,ymm1,0x11 + vpclmulqdq ymm3,ymm19,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_230: + + or r13,r13 + je NEAR $L$_after_reduction_230 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_230: + jmp NEAR $L$_last_blocks_done_216 +$L$_last_num_blocks_is_8_216: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,64 + kmovq k1,[rax*8+r10] + cmp r15d,248 + jae NEAR $L$_16_blocks_overflow_231 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + jmp NEAR $L$_16_blocks_ok_231 + +$L$_16_blocks_overflow_231: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 +$L$_16_blocks_ok_231: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm3,3 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + 
vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19{k1}{z},[64+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti32x4 xmm11,zmm3,3 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10]{k1},zmm3 + vmovdqu8 zmm3{k1}{z},zmm3 + vpshufb zmm17,zmm0,zmm29 + vpshufb zmm19,zmm3,zmm29 + vextracti32x4 xmm7,zmm19,3 + sub r13,16 * (8 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_232 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[224+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + + vpxorq zmm30,zmm30,zmm26 + vpxorq zmm8,zmm8,zmm24 + vpxorq zmm22,zmm22,zmm25 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq 
xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_232 +$L$_small_initial_partial_block_232: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm19,zmm1,0x01 + vpclmulqdq zmm5,zmm19,zmm1,0x10 + vpclmulqdq zmm0,zmm19,zmm1,0x11 + vpclmulqdq zmm3,zmm19,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_232: + + or r13,r13 + je NEAR $L$_after_reduction_232 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_232: + jmp NEAR $L$_last_blocks_done_216 +$L$_last_num_blocks_is_9_216: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,128 + kmovq k1,[rax*8+r10] + cmp r15d,247 + jae NEAR $L$_16_blocks_overflow_233 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd xmm4,xmm3,xmm27 + jmp NEAR $L$_16_blocks_ok_233 + +$L$_16_blocks_overflow_233: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb xmm4,xmm4,xmm29 +$L$_16_blocks_ok_233: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm4,0 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq 
zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 xmm20{k1}{z},[128+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast xmm4,xmm4,xmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq xmm4,xmm4,xmm20 + vextracti32x4 xmm11,zmm4,0 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 XMMWORD[128+r11*1+r10]{k1},xmm4 + vmovdqu8 zmm4{k1}{z},zmm4 + vpshufb zmm17,zmm0,zmm29 + vpshufb zmm19,zmm3,zmm29 + vpshufb xmm20,xmm4,xmm29 + vextracti32x4 xmm7,zmm20,0 + sub r13,16 * (9 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_234 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[208+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm20,xmm1,0x01 + vpclmulqdq xmm5,xmm20,xmm1,0x10 + vpclmulqdq xmm0,xmm20,xmm1,0x11 + vpclmulqdq xmm3,xmm20,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_234 +$L$_small_initial_partial_block_234: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[224+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 
+ vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + + vpxorq zmm30,zmm30,zmm26 + vpxorq zmm8,zmm8,zmm24 + vpxorq zmm22,zmm22,zmm25 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_234: + + or r13,r13 + je NEAR $L$_after_reduction_234 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_234: + jmp NEAR $L$_last_blocks_done_216 +$L$_last_num_blocks_is_10_216: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,128 + kmovq k1,[rax*8+r10] + cmp r15d,246 + jae NEAR $L$_16_blocks_overflow_235 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd ymm4,ymm3,ymm27 + jmp NEAR $L$_16_blocks_ok_235 + +$L$_16_blocks_overflow_235: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb ymm4,ymm4,ymm29 +$L$_16_blocks_ok_235: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm4,1 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 ymm20{k1}{z},[128+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + 
vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast ymm4,ymm4,ymm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq ymm4,ymm4,ymm20 + vextracti32x4 xmm11,zmm4,1 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 YMMWORD[128+r11*1+r10]{k1},ymm4 + vmovdqu8 zmm4{k1}{z},zmm4 + vpshufb zmm17,zmm0,zmm29 + vpshufb zmm19,zmm3,zmm29 + vpshufb ymm20,ymm4,ymm29 + vextracti32x4 xmm7,zmm20,1 + sub r13,16 * (10 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_236 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[192+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm20,ymm1,0x01 + vpclmulqdq ymm5,ymm20,ymm1,0x10 + vpclmulqdq ymm0,ymm20,ymm1,0x11 + vpclmulqdq ymm3,ymm20,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_236 +$L$_small_initial_partial_block_236: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[208+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 
+ vpxorq zmm31,zmm5,zmm31 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm20,xmm1,0x01 + vpclmulqdq xmm5,xmm20,xmm1,0x10 + vpclmulqdq xmm0,xmm20,xmm1,0x11 + vpclmulqdq xmm3,xmm20,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_236: + + or r13,r13 + je NEAR $L$_after_reduction_236 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_236: + jmp NEAR $L$_last_blocks_done_216 +$L$_last_num_blocks_is_11_216: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,128 + kmovq k1,[rax*8+r10] + cmp r15d,245 + jae NEAR $L$_16_blocks_overflow_237 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + jmp NEAR $L$_16_blocks_ok_237 + +$L$_16_blocks_overflow_237: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 +$L$_16_blocks_ok_237: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm4,2 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20{k1}{z},[128+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + 
vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vextracti32x4 xmm11,zmm4,2 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10]{k1},zmm4 + vmovdqu8 zmm4{k1}{z},zmm4 + vpshufb zmm17,zmm0,zmm29 + vpshufb zmm19,zmm3,zmm29 + vpshufb zmm20,zmm4,zmm29 + vextracti32x4 xmm7,zmm20,2 + sub r13,16 * (11 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_238 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[176+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm20,zmm1,0x01 + vpclmulqdq zmm5,zmm20,zmm1,0x10 + vpclmulqdq zmm0,zmm20,zmm1,0x11 + vpclmulqdq zmm3,zmm20,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_238 +$L$_small_initial_partial_block_238: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[192+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq 
ymm4,ymm20,ymm1,0x01 + vpclmulqdq ymm5,ymm20,ymm1,0x10 + vpclmulqdq ymm0,ymm20,ymm1,0x11 + vpclmulqdq ymm3,ymm20,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_238: + + or r13,r13 + je NEAR $L$_after_reduction_238 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_238: + jmp NEAR $L$_last_blocks_done_216 +$L$_last_num_blocks_is_12_216: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,128 + kmovq k1,[rax*8+r10] + cmp r15d,244 + jae NEAR $L$_16_blocks_overflow_239 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + jmp NEAR $L$_16_blocks_ok_239 + +$L$_16_blocks_overflow_239: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 +$L$_16_blocks_ok_239: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm4,3 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20{k1}{z},[128+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + 
vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vextracti32x4 xmm11,zmm4,3 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10]{k1},zmm4 + vmovdqu8 zmm4{k1}{z},zmm4 + vpshufb zmm17,zmm0,zmm29 + vpshufb zmm19,zmm3,zmm29 + vpshufb zmm20,zmm4,zmm29 + vextracti32x4 xmm7,zmm20,3 + sub r13,16 * (12 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_240 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[160+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[224+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + + vpxorq zmm30,zmm30,zmm26 + vpxorq zmm8,zmm8,zmm24 + vpxorq zmm22,zmm22,zmm25 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_240 +$L$_small_initial_partial_block_240: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[176+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm20,zmm1,0x01 + vpclmulqdq zmm5,zmm20,zmm1,0x10 + vpclmulqdq zmm0,zmm20,zmm1,0x11 + 
vpclmulqdq zmm3,zmm20,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_240: + + or r13,r13 + je NEAR $L$_after_reduction_240 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_240: + jmp NEAR $L$_last_blocks_done_216 +$L$_last_num_blocks_is_13_216: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,192 + kmovq k1,[rax*8+r10] + cmp r15d,243 + jae NEAR $L$_16_blocks_overflow_241 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + vpaddd xmm5,xmm4,xmm27 + jmp NEAR $L$_16_blocks_ok_241 + +$L$_16_blocks_overflow_241: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpaddd zmm5,zmm4,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb xmm5,xmm5,xmm29 +$L$_16_blocks_ok_241: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm5,0 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vpxorq xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20,ZMMWORD[128+r11*1+r9] + vmovdqu8 xmm21{k1}{z},[192+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + 
vaesenc xmm5,xmm5,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vaesenclast xmm5,xmm5,xmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vpxorq xmm5,xmm5,xmm21 + vextracti32x4 xmm11,zmm5,0 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm4 + vmovdqu8 XMMWORD[192+r11*1+r10]{k1},xmm5 + vmovdqu8 zmm5{k1}{z},zmm5 + vpshufb zmm17,zmm0,zmm29 + vpshufb zmm19,zmm3,zmm29 + vpshufb zmm20,zmm4,zmm29 + vpshufb xmm21,xmm5,xmm29 + vextracti32x4 xmm7,zmm21,0 + sub r13,16 * (13 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_242 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[144+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[208+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm21,xmm1,0x01 + vpclmulqdq xmm5,xmm21,xmm1,0x10 + vpclmulqdq xmm0,xmm21,xmm1,0x11 + vpclmulqdq xmm3,xmm21,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_242 +$L$_small_initial_partial_block_242: + + + + + + + + + mov 
QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[160+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[224+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + + vpxorq zmm30,zmm30,zmm26 + vpxorq zmm8,zmm8,zmm24 + vpxorq zmm22,zmm22,zmm25 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_242: + + or r13,r13 + je NEAR $L$_after_reduction_242 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_242: + jmp NEAR $L$_last_blocks_done_216 +$L$_last_num_blocks_is_14_216: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,192 + kmovq k1,[rax*8+r10] + cmp r15d,242 + jae NEAR $L$_16_blocks_overflow_243 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + vpaddd ymm5,ymm4,ymm27 + jmp NEAR $L$_16_blocks_ok_243 + +$L$_16_blocks_overflow_243: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpaddd zmm5,zmm4,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb ymm5,ymm5,ymm29 +$L$_16_blocks_ok_243: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm5,1 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vpxorq ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc 
zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20,ZMMWORD[128+r11*1+r9] + vmovdqu8 ymm21{k1}{z},[192+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vaesenclast ymm5,ymm5,ymm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vpxorq ymm5,ymm5,ymm21 + vextracti32x4 xmm11,zmm5,1 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm4 + vmovdqu8 YMMWORD[192+r11*1+r10]{k1},ymm5 + vmovdqu8 zmm5{k1}{z},zmm5 + vpshufb zmm17,zmm0,zmm29 + vpshufb zmm19,zmm3,zmm29 + vpshufb zmm20,zmm4,zmm29 + vpshufb ymm21,ymm5,ymm29 + vextracti32x4 xmm7,zmm21,1 + sub r13,16 * (14 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_244 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[128+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[192+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm21,ymm1,0x01 + vpclmulqdq ymm5,ymm21,ymm1,0x10 + vpclmulqdq ymm0,ymm21,ymm1,0x11 + vpclmulqdq ymm3,ymm21,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq 
zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_244 +$L$_small_initial_partial_block_244: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[144+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[208+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm21,xmm1,0x01 + vpclmulqdq xmm5,xmm21,xmm1,0x10 + vpclmulqdq xmm0,xmm21,xmm1,0x11 + vpclmulqdq xmm3,xmm21,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_244: + + or r13,r13 + je NEAR $L$_after_reduction_244 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_244: + jmp NEAR $L$_last_blocks_done_216 +$L$_last_num_blocks_is_15_216: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,192 + kmovq k1,[rax*8+r10] + cmp r15d,241 + jae NEAR $L$_16_blocks_overflow_245 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + vpaddd zmm5,zmm4,zmm27 + jmp NEAR $L$_16_blocks_ok_245 + +$L$_16_blocks_overflow_245: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpaddd zmm5,zmm4,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb zmm5,zmm5,zmm29 +$L$_16_blocks_ok_245: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm5,2 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq 
zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20,ZMMWORD[128+r11*1+r9] + vmovdqu8 zmm21{k1}{z},[192+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vaesenclast zmm5,zmm5,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vpxorq zmm5,zmm5,zmm21 + vextracti32x4 xmm11,zmm5,2 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm4 + vmovdqu8 ZMMWORD[192+r11*1+r10]{k1},zmm5 + vmovdqu8 zmm5{k1}{z},zmm5 + vpshufb zmm17,zmm0,zmm29 + vpshufb zmm19,zmm3,zmm29 + vpshufb zmm20,zmm4,zmm29 + vpshufb zmm21,zmm5,zmm29 + vextracti32x4 xmm7,zmm21,2 + sub r13,16 * (15 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_246 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[112+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 
zmm1,ZMMWORD[176+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm21,zmm1,0x01 + vpclmulqdq zmm5,zmm21,zmm1,0x10 + vpclmulqdq zmm0,zmm21,zmm1,0x11 + vpclmulqdq zmm3,zmm21,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_246 +$L$_small_initial_partial_block_246: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[128+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[192+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm21,ymm1,0x01 + vpclmulqdq ymm5,ymm21,ymm1,0x10 + vpclmulqdq ymm0,ymm21,ymm1,0x11 + vpclmulqdq ymm3,ymm21,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_246: + + or r13,r13 + je NEAR $L$_after_reduction_246 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_246: + jmp NEAR $L$_last_blocks_done_216 +$L$_last_num_blocks_is_16_216: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,192 + kmovq k1,[rax*8+r10] + cmp r15d,240 + jae NEAR $L$_16_blocks_overflow_247 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd 
zmm4,zmm3,zmm27 + vpaddd zmm5,zmm4,zmm27 + jmp NEAR $L$_16_blocks_ok_247 + +$L$_16_blocks_overflow_247: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpaddd zmm5,zmm4,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb zmm5,zmm5,zmm29 +$L$_16_blocks_ok_247: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm5,3 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20,ZMMWORD[128+r11*1+r9] + vmovdqu8 zmm21{k1}{z},[192+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vaesenclast 
zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vaesenclast zmm5,zmm5,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vpxorq zmm5,zmm5,zmm21 + vextracti32x4 xmm11,zmm5,3 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm4 + vmovdqu8 ZMMWORD[192+r11*1+r10]{k1},zmm5 + vmovdqu8 zmm5{k1}{z},zmm5 + vpshufb zmm17,zmm0,zmm29 + vpshufb zmm19,zmm3,zmm29 + vpshufb zmm20,zmm4,zmm29 + vpshufb zmm21,zmm5,zmm29 + vextracti32x4 xmm7,zmm21,3 + sub r13,16 * (16 - 1) +$L$_small_initial_partial_block_248: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[112+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[176+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm21,zmm1,0x01 + vpclmulqdq zmm5,zmm21,zmm1,0x10 + vpclmulqdq zmm0,zmm21,zmm1,0x11 + vpclmulqdq zmm3,zmm21,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_248: + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_248: + jmp NEAR $L$_last_blocks_done_216 +$L$_last_num_blocks_is_0_216: + vmovdqa64 zmm13,ZMMWORD[768+rsp] + vpxorq zmm13,zmm13,zmm14 + vmovdqu64 zmm12,ZMMWORD[rbx*1+rsp] + vpclmulqdq zmm0,zmm13,zmm12,0x11 + vpclmulqdq zmm3,zmm13,zmm12,0x00 + vpclmulqdq zmm4,zmm13,zmm12,0x01 + vpclmulqdq zmm5,zmm13,zmm12,0x10 + vmovdqa64 zmm13,ZMMWORD[832+rsp] + vmovdqu64 zmm12,ZMMWORD[64+rbx*1+rsp] + vpclmulqdq zmm6,zmm13,zmm12,0x11 + vpclmulqdq zmm7,zmm13,zmm12,0x00 + vpclmulqdq zmm10,zmm13,zmm12,0x01 + vpclmulqdq zmm11,zmm13,zmm12,0x10 + vpxorq zmm26,zmm4,zmm10 + vpxorq zmm24,zmm0,zmm6 + vpxorq zmm25,zmm3,zmm7 + vpternlogq zmm26,zmm5,zmm11,0x96 + vmovdqa64 zmm13,ZMMWORD[896+rsp] + vmovdqu64 zmm12,ZMMWORD[128+rbx*1+rsp] + vpclmulqdq zmm0,zmm13,zmm12,0x11 + vpclmulqdq zmm3,zmm13,zmm12,0x00 + vpclmulqdq zmm4,zmm13,zmm12,0x01 + vpclmulqdq zmm5,zmm13,zmm12,0x10 + vmovdqa64 zmm13,ZMMWORD[960+rsp] + vmovdqu64 zmm12,ZMMWORD[192+rbx*1+rsp] + vpclmulqdq zmm6,zmm13,zmm12,0x11 + vpclmulqdq zmm7,zmm13,zmm12,0x00 + vpclmulqdq zmm10,zmm13,zmm12,0x01 + vpclmulqdq zmm11,zmm13,zmm12,0x10 + + vpternlogq zmm26,zmm4,zmm10,0x96 + vpternlogq zmm24,zmm0,zmm6,0x96 + 
vpternlogq zmm25,zmm3,zmm7,0x96 + vpternlogq zmm26,zmm5,zmm11,0x96 + + vpsrldq zmm0,zmm26,8 + vpslldq zmm3,zmm26,8 + vpxorq zmm24,zmm24,zmm0 + vpxorq zmm25,zmm25,zmm3 + vextracti64x4 ymm0,zmm24,1 + vpxorq ymm24,ymm24,ymm0 + vextracti32x4 xmm0,ymm24,1 + vpxorq xmm24,xmm24,xmm0 + vextracti64x4 ymm3,zmm25,1 + vpxorq ymm25,ymm25,ymm3 + vextracti32x4 xmm3,ymm25,1 + vpxorq xmm25,xmm25,xmm3 + vmovdqa64 xmm4,XMMWORD[POLY2] + + + vpclmulqdq xmm0,xmm4,xmm25,0x01 + vpslldq xmm0,xmm0,8 + vpxorq xmm0,xmm25,xmm0 + + + vpclmulqdq xmm3,xmm4,xmm0,0x00 + vpsrldq xmm3,xmm3,4 + vpclmulqdq xmm14,xmm4,xmm0,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm3,xmm24,0x96 + +$L$_last_blocks_done_216: + vpshufb xmm2,xmm2,xmm29 + jmp NEAR $L$_ghash_done_172 +$L$_encrypt_16_blocks_172: + cmp r15b,240 + jae NEAR $L$_16_blocks_overflow_249 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + vpaddd zmm5,zmm4,zmm27 + jmp NEAR $L$_16_blocks_ok_249 +$L$_16_blocks_overflow_249: + vpshufb zmm2,zmm2,zmm29 + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpaddd zmm5,zmm4,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb zmm5,zmm5,zmm29 +$L$_16_blocks_ok_249: + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rsp] + + + + + vshufi64x2 zmm2,zmm5,zmm5,255 + add r15b,16 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + + + + + + + + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + + vpclmulqdq zmm6,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + + + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + + + + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + + vpternlogq zmm6,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + + + + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + + + + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20,ZMMWORD[128+r11*1+r9] + vmovdqu8 zmm21,ZMMWORD[192+r11*1+r9] + + + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + + + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc 
zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm26,zmm10,zmm15 + vpxorq zmm24,zmm6,zmm12 + vpxorq zmm25,zmm7,zmm13 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vaesenclast zmm5,zmm5,zmm30 + + + + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vpxorq zmm5,zmm5,zmm21 + + + + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm4 + vmovdqu8 ZMMWORD[192+r11*1+r10],zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb zmm5,zmm5,zmm29 + vmovdqa64 ZMMWORD[1280+rsp],zmm0 + vmovdqa64 ZMMWORD[1344+rsp],zmm3 + vmovdqa64 ZMMWORD[1408+rsp],zmm4 + vmovdqa64 ZMMWORD[1472+rsp],zmm5 + vmovdqa64 zmm13,ZMMWORD[1024+rsp] + vmovdqu64 zmm12,ZMMWORD[256+rsp] + vpclmulqdq zmm0,zmm13,zmm12,0x11 + vpclmulqdq zmm3,zmm13,zmm12,0x00 + vpclmulqdq zmm4,zmm13,zmm12,0x01 + vpclmulqdq zmm5,zmm13,zmm12,0x10 + vmovdqa64 zmm13,ZMMWORD[1088+rsp] + vmovdqu64 zmm12,ZMMWORD[320+rsp] + vpclmulqdq zmm6,zmm13,zmm12,0x11 + vpclmulqdq zmm7,zmm13,zmm12,0x00 + vpclmulqdq zmm10,zmm13,zmm12,0x01 + vpclmulqdq zmm11,zmm13,zmm12,0x10 + vpternlogq zmm26,zmm4,zmm10,0x96 + vpternlogq zmm24,zmm0,zmm6,0x96 + vpternlogq zmm25,zmm3,zmm7,0x96 + vpternlogq zmm26,zmm5,zmm11,0x96 + vmovdqa64 zmm13,ZMMWORD[1152+rsp] + vmovdqu64 zmm12,ZMMWORD[384+rsp] + vpclmulqdq zmm0,zmm13,zmm12,0x11 + vpclmulqdq zmm3,zmm13,zmm12,0x00 + vpclmulqdq zmm4,zmm13,zmm12,0x01 + vpclmulqdq zmm5,zmm13,zmm12,0x10 + vmovdqa64 zmm13,ZMMWORD[1216+rsp] + vmovdqu64 zmm12,ZMMWORD[448+rsp] + vpclmulqdq zmm6,zmm13,zmm12,0x11 + vpclmulqdq zmm7,zmm13,zmm12,0x00 + vpclmulqdq zmm10,zmm13,zmm12,0x01 + vpclmulqdq zmm11,zmm13,zmm12,0x10 + + vpternlogq zmm26,zmm4,zmm10,0x96 + vpternlogq zmm24,zmm0,zmm6,0x96 + vpternlogq zmm25,zmm3,zmm7,0x96 + vpternlogq zmm26,zmm5,zmm11,0x96 + sub r13,256 + add r11,256 + mov r10d,r13d + add r10d,15 + shr r10d,4 + je NEAR $L$_last_num_blocks_is_0_250 + + cmp r10d,8 + je NEAR $L$_last_num_blocks_is_8_250 + jb NEAR $L$_last_num_blocks_is_7_1_250 + + + cmp r10d,12 + je NEAR $L$_last_num_blocks_is_12_250 + jb NEAR $L$_last_num_blocks_is_11_9_250 + + + cmp r10d,15 + je NEAR $L$_last_num_blocks_is_15_250 + ja NEAR $L$_last_num_blocks_is_16_250 + cmp r10d,14 + je NEAR $L$_last_num_blocks_is_14_250 + jmp NEAR $L$_last_num_blocks_is_13_250 + +$L$_last_num_blocks_is_11_9_250: + + cmp r10d,10 + je NEAR $L$_last_num_blocks_is_10_250 + ja NEAR $L$_last_num_blocks_is_11_250 + jmp NEAR $L$_last_num_blocks_is_9_250 + +$L$_last_num_blocks_is_7_1_250: + cmp r10d,4 + je NEAR $L$_last_num_blocks_is_4_250 + jb NEAR $L$_last_num_blocks_is_3_1_250 + + cmp r10d,6 + ja NEAR $L$_last_num_blocks_is_7_250 + je NEAR $L$_last_num_blocks_is_6_250 + 
jmp NEAR $L$_last_num_blocks_is_5_250 + +$L$_last_num_blocks_is_3_1_250: + + cmp r10d,2 + ja NEAR $L$_last_num_blocks_is_3_250 + je NEAR $L$_last_num_blocks_is_2_250 +$L$_last_num_blocks_is_1_250: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + kmovq k1,[rax*8+r10] + cmp r15d,255 + jae NEAR $L$_16_blocks_overflow_251 + vpaddd xmm0,xmm2,xmm28 + jmp NEAR $L$_16_blocks_ok_251 + +$L$_16_blocks_overflow_251: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpshufb xmm0,xmm0,xmm29 +$L$_16_blocks_ok_251: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1280+rsp] + vmovdqu64 zmm1,ZMMWORD[512+rsp] + vextracti32x4 xmm2,zmm0,0 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[576+rsp] + vmovdqa64 zmm22,ZMMWORD[1344+rsp] + vpxorq xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[640+rsp] + vmovdqa64 zmm8,ZMMWORD[1408+rsp] + vaesenc xmm0,xmm0,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[704+rsp] + vmovdqa64 zmm22,ZMMWORD[1472+rsp] + vaesenc xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc xmm0,xmm0,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 xmm17{k1}{z},[r11*1+r9] + vaesenc xmm0,xmm0,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm14,zmm24,zmm12,0x96 + vpternlogq zmm7,zmm25,zmm13,0x96 + vpternlogq zmm10,zmm26,zmm15,0x96 + vaesenc xmm0,xmm0,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vpsrldq zmm15,zmm10,8 + vpslldq zmm10,zmm10,8 + + vmovdqa64 xmm16,XMMWORD[POLY2] + vaesenc xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vpxorq zmm14,zmm14,zmm15 + vpxorq zmm7,zmm7,zmm10 + vaesenc xmm0,xmm0,xmm31 + vextracti64x4 ymm12,zmm14,1 + vpxorq ymm14,ymm14,ymm12 + vextracti32x4 xmm12,ymm14,1 + vpxorq xmm14,xmm14,xmm12 + vextracti64x4 ymm13,zmm7,1 + vpxorq ymm7,ymm7,ymm13 + vextracti32x4 xmm13,ymm7,1 + vpxorq xmm7,xmm7,xmm13 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vpclmulqdq xmm13,xmm16,xmm7,0x01 + vpslldq xmm13,xmm13,8 + vpxorq xmm13,xmm7,xmm13 + vaesenc xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc xmm0,xmm0,xmm31 + vpclmulqdq xmm12,xmm16,xmm13,0x00 + vpsrldq xmm12,xmm12,4 + vpclmulqdq xmm15,xmm16,xmm13,0x10 + vpslldq xmm15,xmm15,4 + + vpternlogq xmm14,xmm15,xmm12,0x96 + vaesenclast xmm0,xmm0,xmm30 + vpxorq xmm0,xmm0,xmm17 + vextracti32x4 xmm11,zmm0,0 + mov r10,QWORD[120+rbp] + vmovdqu8 XMMWORD[r11*1+r10]{k1},xmm0 + vmovdqu8 zmm0{k1}{z},zmm0 + vpshufb xmm17,xmm0,xmm29 + vextracti32x4 xmm7,zmm17,0 + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_252 + + + + + + sub r13,16 + mov QWORD[r8],0 + 
vpxorq zmm17,zmm17,zmm14 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm17,xmm1,0x01 + vpclmulqdq xmm5,xmm17,xmm1,0x10 + vpclmulqdq xmm0,xmm17,xmm1,0x11 + vpclmulqdq xmm3,xmm17,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_252 +$L$_small_initial_partial_block_252: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + + + + + + + + + + + + vpxorq xmm14,xmm14,xmm7 + + jmp NEAR $L$_after_reduction_252 +$L$_small_initial_compute_done_252: +$L$_after_reduction_252: + jmp NEAR $L$_last_blocks_done_250 +$L$_last_num_blocks_is_2_250: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + kmovq k1,[rax*8+r10] + cmp r15d,254 + jae NEAR $L$_16_blocks_overflow_253 + vpaddd ymm0,ymm2,ymm28 + jmp NEAR $L$_16_blocks_ok_253 + +$L$_16_blocks_overflow_253: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpshufb ymm0,ymm0,ymm29 +$L$_16_blocks_ok_253: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1280+rsp] + vmovdqu64 zmm1,ZMMWORD[512+rsp] + vextracti32x4 xmm2,zmm0,1 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[576+rsp] + vmovdqa64 zmm22,ZMMWORD[1344+rsp] + vpxorq ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[640+rsp] + vmovdqa64 zmm8,ZMMWORD[1408+rsp] + vaesenc ymm0,ymm0,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[704+rsp] + vmovdqa64 zmm22,ZMMWORD[1472+rsp] + vaesenc ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc ymm0,ymm0,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 ymm17{k1}{z},[r11*1+r9] + vaesenc ymm0,ymm0,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm14,zmm24,zmm12,0x96 + vpternlogq zmm7,zmm25,zmm13,0x96 + vpternlogq zmm10,zmm26,zmm15,0x96 + vaesenc ymm0,ymm0,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vpsrldq zmm15,zmm10,8 + vpslldq zmm10,zmm10,8 + + vmovdqa64 xmm16,XMMWORD[POLY2] + vaesenc ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vpxorq zmm14,zmm14,zmm15 + vpxorq 
zmm7,zmm7,zmm10 + vaesenc ymm0,ymm0,ymm31 + vextracti64x4 ymm12,zmm14,1 + vpxorq ymm14,ymm14,ymm12 + vextracti32x4 xmm12,ymm14,1 + vpxorq xmm14,xmm14,xmm12 + vextracti64x4 ymm13,zmm7,1 + vpxorq ymm7,ymm7,ymm13 + vextracti32x4 xmm13,ymm7,1 + vpxorq xmm7,xmm7,xmm13 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vpclmulqdq xmm13,xmm16,xmm7,0x01 + vpslldq xmm13,xmm13,8 + vpxorq xmm13,xmm7,xmm13 + vaesenc ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc ymm0,ymm0,ymm31 + vpclmulqdq xmm12,xmm16,xmm13,0x00 + vpsrldq xmm12,xmm12,4 + vpclmulqdq xmm15,xmm16,xmm13,0x10 + vpslldq xmm15,xmm15,4 + + vpternlogq xmm14,xmm15,xmm12,0x96 + vaesenclast ymm0,ymm0,ymm30 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm11,zmm0,1 + mov r10,QWORD[120+rbp] + vmovdqu8 YMMWORD[r11*1+r10]{k1},ymm0 + vmovdqu8 zmm0{k1}{z},zmm0 + vpshufb ymm17,ymm0,ymm29 + vextracti32x4 xmm7,zmm17,1 + sub r13,16 * (2 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_254 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm17,ymm1,0x01 + vpclmulqdq ymm5,ymm17,ymm1,0x10 + vpclmulqdq ymm0,ymm17,ymm1,0x11 + vpclmulqdq ymm3,ymm17,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_254 +$L$_small_initial_partial_block_254: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm17,xmm1,0x01 + vpclmulqdq xmm5,xmm17,xmm1,0x10 + vpclmulqdq xmm0,xmm17,xmm1,0x11 + vpclmulqdq xmm3,xmm17,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_254: + + or r13,r13 + je NEAR $L$_after_reduction_254 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_254: + jmp NEAR $L$_last_blocks_done_250 +$L$_last_num_blocks_is_3_250: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + kmovq k1,[rax*8+r10] + cmp r15d,253 + jae NEAR $L$_16_blocks_overflow_255 + vpaddd zmm0,zmm2,zmm28 + jmp NEAR $L$_16_blocks_ok_255 + +$L$_16_blocks_overflow_255: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpshufb zmm0,zmm0,zmm29 +$L$_16_blocks_ok_255: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1280+rsp] + vmovdqu64 zmm1,ZMMWORD[512+rsp] + vextracti32x4 xmm2,zmm0,2 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[576+rsp] + 
vmovdqa64 zmm22,ZMMWORD[1344+rsp] + vpxorq zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[640+rsp] + vmovdqa64 zmm8,ZMMWORD[1408+rsp] + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[704+rsp] + vmovdqa64 zmm22,ZMMWORD[1472+rsp] + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17{k1}{z},[r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm14,zmm24,zmm12,0x96 + vpternlogq zmm7,zmm25,zmm13,0x96 + vpternlogq zmm10,zmm26,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vpsrldq zmm15,zmm10,8 + vpslldq zmm10,zmm10,8 + + vmovdqa64 xmm16,XMMWORD[POLY2] + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vpxorq zmm14,zmm14,zmm15 + vpxorq zmm7,zmm7,zmm10 + vaesenc zmm0,zmm0,zmm31 + vextracti64x4 ymm12,zmm14,1 + vpxorq ymm14,ymm14,ymm12 + vextracti32x4 xmm12,ymm14,1 + vpxorq xmm14,xmm14,xmm12 + vextracti64x4 ymm13,zmm7,1 + vpxorq ymm7,ymm7,ymm13 + vextracti32x4 xmm13,ymm7,1 + vpxorq xmm7,xmm7,xmm13 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vpclmulqdq xmm13,xmm16,xmm7,0x01 + vpslldq xmm13,xmm13,8 + vpxorq xmm13,xmm7,xmm13 + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vpclmulqdq xmm12,xmm16,xmm13,0x00 + vpsrldq xmm12,xmm12,4 + vpclmulqdq xmm15,xmm16,xmm13,0x10 + vpslldq xmm15,xmm15,4 + + vpternlogq xmm14,xmm15,xmm12,0x96 + vaesenclast zmm0,zmm0,zmm30 + vpxorq zmm0,zmm0,zmm17 + vextracti32x4 xmm11,zmm0,2 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10]{k1},zmm0 + vmovdqu8 zmm0{k1}{z},zmm0 + vpshufb zmm17,zmm0,zmm29 + vextracti32x4 xmm7,zmm17,2 + sub r13,16 * (3 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_256 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + 
vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_256 +$L$_small_initial_partial_block_256: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm17,ymm1,0x01 + vpclmulqdq ymm5,ymm17,ymm1,0x10 + vpclmulqdq ymm0,ymm17,ymm1,0x11 + vpclmulqdq ymm3,ymm17,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_256: + + or r13,r13 + je NEAR $L$_after_reduction_256 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_256: + jmp NEAR $L$_last_blocks_done_250 +$L$_last_num_blocks_is_4_250: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + kmovq k1,[rax*8+r10] + cmp r15d,252 + jae NEAR $L$_16_blocks_overflow_257 + vpaddd zmm0,zmm2,zmm28 + jmp NEAR $L$_16_blocks_ok_257 + +$L$_16_blocks_overflow_257: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpshufb zmm0,zmm0,zmm29 +$L$_16_blocks_ok_257: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1280+rsp] + vmovdqu64 zmm1,ZMMWORD[512+rsp] + vextracti32x4 xmm2,zmm0,3 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[576+rsp] + vmovdqa64 zmm22,ZMMWORD[1344+rsp] + vpxorq zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[640+rsp] + vmovdqa64 zmm8,ZMMWORD[1408+rsp] + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[704+rsp] + vmovdqa64 zmm22,ZMMWORD[1472+rsp] + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17{k1}{z},[r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm14,zmm24,zmm12,0x96 + vpternlogq zmm7,zmm25,zmm13,0x96 + vpternlogq zmm10,zmm26,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vpsrldq zmm15,zmm10,8 + vpslldq zmm10,zmm10,8 + + vmovdqa64 xmm16,XMMWORD[POLY2] + vaesenc zmm0,zmm0,zmm30 + 
vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vpxorq zmm14,zmm14,zmm15 + vpxorq zmm7,zmm7,zmm10 + vaesenc zmm0,zmm0,zmm31 + vextracti64x4 ymm12,zmm14,1 + vpxorq ymm14,ymm14,ymm12 + vextracti32x4 xmm12,ymm14,1 + vpxorq xmm14,xmm14,xmm12 + vextracti64x4 ymm13,zmm7,1 + vpxorq ymm7,ymm7,ymm13 + vextracti32x4 xmm13,ymm7,1 + vpxorq xmm7,xmm7,xmm13 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vpclmulqdq xmm13,xmm16,xmm7,0x01 + vpslldq xmm13,xmm13,8 + vpxorq xmm13,xmm7,xmm13 + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vpclmulqdq xmm12,xmm16,xmm13,0x00 + vpsrldq xmm12,xmm12,4 + vpclmulqdq xmm15,xmm16,xmm13,0x10 + vpslldq xmm15,xmm15,4 + + vpternlogq xmm14,xmm15,xmm12,0x96 + vaesenclast zmm0,zmm0,zmm30 + vpxorq zmm0,zmm0,zmm17 + vextracti32x4 xmm11,zmm0,3 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10]{k1},zmm0 + vmovdqu8 zmm0{k1}{z},zmm0 + vpshufb zmm17,zmm0,zmm29 + vextracti32x4 xmm7,zmm17,3 + sub r13,16 * (4 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_258 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_258 +$L$_small_initial_partial_block_258: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_258: + + or r13,r13 + je NEAR $L$_after_reduction_258 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_258: + jmp NEAR $L$_last_blocks_done_250 +$L$_last_num_blocks_is_5_250: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,64 + kmovq k1,[rax*8+r10] + cmp r15d,251 + jae NEAR $L$_16_blocks_overflow_259 + vpaddd zmm0,zmm2,zmm28 + vpaddd xmm3,xmm0,xmm27 + jmp NEAR $L$_16_blocks_ok_259 + +$L$_16_blocks_overflow_259: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb xmm3,xmm3,xmm29 +$L$_16_blocks_ok_259: + + + + 
+ vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1280+rsp] + vmovdqu64 zmm1,ZMMWORD[512+rsp] + vextracti32x4 xmm2,zmm3,0 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[576+rsp] + vmovdqa64 zmm22,ZMMWORD[1344+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[640+rsp] + vmovdqa64 zmm8,ZMMWORD[1408+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[704+rsp] + vmovdqa64 zmm22,ZMMWORD[1472+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 xmm19{k1}{z},[64+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm14,zmm24,zmm12,0x96 + vpternlogq zmm7,zmm25,zmm13,0x96 + vpternlogq zmm10,zmm26,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vpsrldq zmm15,zmm10,8 + vpslldq zmm10,zmm10,8 + + vmovdqa64 xmm16,XMMWORD[POLY2] + vaesenc zmm0,zmm0,zmm30 + vaesenc xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vpxorq zmm14,zmm14,zmm15 + vpxorq zmm7,zmm7,zmm10 + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vextracti64x4 ymm12,zmm14,1 + vpxorq ymm14,ymm14,ymm12 + vextracti32x4 xmm12,ymm14,1 + vpxorq xmm14,xmm14,xmm12 + vextracti64x4 ymm13,zmm7,1 + vpxorq ymm7,ymm7,ymm13 + vextracti32x4 xmm13,ymm7,1 + vpxorq xmm7,xmm7,xmm13 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vpclmulqdq xmm13,xmm16,xmm7,0x01 + vpslldq xmm13,xmm13,8 + vpxorq xmm13,xmm7,xmm13 + vaesenc zmm0,zmm0,zmm30 + vaesenc xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vpclmulqdq xmm12,xmm16,xmm13,0x00 + vpsrldq xmm12,xmm12,4 + vpclmulqdq xmm15,xmm16,xmm13,0x10 + vpslldq xmm15,xmm15,4 + + vpternlogq xmm14,xmm15,xmm12,0x96 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast xmm3,xmm3,xmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq xmm3,xmm3,xmm19 + vextracti32x4 xmm11,zmm3,0 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 XMMWORD[64+r11*1+r10]{k1},xmm3 + vmovdqu8 zmm3{k1}{z},zmm3 + vpshufb zmm17,zmm0,zmm29 + vpshufb xmm19,xmm3,xmm29 + vextracti32x4 xmm7,zmm19,0 + sub r13,16 * (5 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_260 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 
zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm19,xmm1,0x01 + vpclmulqdq xmm5,xmm19,xmm1,0x10 + vpclmulqdq xmm0,xmm19,xmm1,0x11 + vpclmulqdq xmm3,xmm19,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm31 + vpxorq zmm0,zmm0,zmm8 + vpxorq zmm3,zmm3,zmm22 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_260 +$L$_small_initial_partial_block_260: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_260: + + or r13,r13 + je NEAR $L$_after_reduction_260 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_260: + jmp NEAR $L$_last_blocks_done_250 +$L$_last_num_blocks_is_6_250: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,64 + kmovq k1,[rax*8+r10] + cmp r15d,250 + jae NEAR $L$_16_blocks_overflow_261 + vpaddd zmm0,zmm2,zmm28 + vpaddd ymm3,ymm0,ymm27 + jmp NEAR $L$_16_blocks_ok_261 + +$L$_16_blocks_overflow_261: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb ymm3,ymm3,ymm29 +$L$_16_blocks_ok_261: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1280+rsp] + vmovdqu64 zmm1,ZMMWORD[512+rsp] + vextracti32x4 xmm2,zmm3,1 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[576+rsp] + vmovdqa64 zmm22,ZMMWORD[1344+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[640+rsp] + vmovdqa64 zmm8,ZMMWORD[1408+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[704+rsp] + vmovdqa64 
zmm22,ZMMWORD[1472+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 ymm19{k1}{z},[64+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm14,zmm24,zmm12,0x96 + vpternlogq zmm7,zmm25,zmm13,0x96 + vpternlogq zmm10,zmm26,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vpsrldq zmm15,zmm10,8 + vpslldq zmm10,zmm10,8 + + vmovdqa64 xmm16,XMMWORD[POLY2] + vaesenc zmm0,zmm0,zmm30 + vaesenc ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vpxorq zmm14,zmm14,zmm15 + vpxorq zmm7,zmm7,zmm10 + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vextracti64x4 ymm12,zmm14,1 + vpxorq ymm14,ymm14,ymm12 + vextracti32x4 xmm12,ymm14,1 + vpxorq xmm14,xmm14,xmm12 + vextracti64x4 ymm13,zmm7,1 + vpxorq ymm7,ymm7,ymm13 + vextracti32x4 xmm13,ymm7,1 + vpxorq xmm7,xmm7,xmm13 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vpclmulqdq xmm13,xmm16,xmm7,0x01 + vpslldq xmm13,xmm13,8 + vpxorq xmm13,xmm7,xmm13 + vaesenc zmm0,zmm0,zmm30 + vaesenc ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vpclmulqdq xmm12,xmm16,xmm13,0x00 + vpsrldq xmm12,xmm12,4 + vpclmulqdq xmm15,xmm16,xmm13,0x10 + vpslldq xmm15,xmm15,4 + + vpternlogq xmm14,xmm15,xmm12,0x96 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast ymm3,ymm3,ymm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm11,zmm3,1 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 YMMWORD[64+r11*1+r10]{k1},ymm3 + vmovdqu8 zmm3{k1}{z},zmm3 + vpshufb zmm17,zmm0,zmm29 + vpshufb ymm19,ymm3,ymm29 + vextracti32x4 xmm7,zmm19,1 + sub r13,16 * (6 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_262 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm19,ymm1,0x01 + vpclmulqdq ymm5,ymm19,ymm1,0x10 + vpclmulqdq ymm0,ymm19,ymm1,0x11 + vpclmulqdq ymm3,ymm19,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm31 + vpxorq zmm0,zmm0,zmm8 + vpxorq zmm3,zmm3,zmm22 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq 
xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_262 +$L$_small_initial_partial_block_262: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm19,xmm1,0x01 + vpclmulqdq xmm5,xmm19,xmm1,0x10 + vpclmulqdq xmm0,xmm19,xmm1,0x11 + vpclmulqdq xmm3,xmm19,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm31 + vpxorq zmm0,zmm0,zmm8 + vpxorq zmm3,zmm3,zmm22 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_262: + + or r13,r13 + je NEAR $L$_after_reduction_262 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_262: + jmp NEAR $L$_last_blocks_done_250 +$L$_last_num_blocks_is_7_250: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,64 + kmovq k1,[rax*8+r10] + cmp r15d,249 + jae NEAR $L$_16_blocks_overflow_263 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + jmp NEAR $L$_16_blocks_ok_263 + +$L$_16_blocks_overflow_263: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 +$L$_16_blocks_ok_263: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1280+rsp] + vmovdqu64 zmm1,ZMMWORD[512+rsp] + vextracti32x4 xmm2,zmm3,2 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[576+rsp] + vmovdqa64 zmm22,ZMMWORD[1344+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[640+rsp] + vmovdqa64 zmm8,ZMMWORD[1408+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[704+rsp] + vmovdqa64 zmm22,ZMMWORD[1472+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 
zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19{k1}{z},[64+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm14,zmm24,zmm12,0x96 + vpternlogq zmm7,zmm25,zmm13,0x96 + vpternlogq zmm10,zmm26,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vpsrldq zmm15,zmm10,8 + vpslldq zmm10,zmm10,8 + + vmovdqa64 xmm16,XMMWORD[POLY2] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vpxorq zmm14,zmm14,zmm15 + vpxorq zmm7,zmm7,zmm10 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vextracti64x4 ymm12,zmm14,1 + vpxorq ymm14,ymm14,ymm12 + vextracti32x4 xmm12,ymm14,1 + vpxorq xmm14,xmm14,xmm12 + vextracti64x4 ymm13,zmm7,1 + vpxorq ymm7,ymm7,ymm13 + vextracti32x4 xmm13,ymm7,1 + vpxorq xmm7,xmm7,xmm13 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vpclmulqdq xmm13,xmm16,xmm7,0x01 + vpslldq xmm13,xmm13,8 + vpxorq xmm13,xmm7,xmm13 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vpclmulqdq xmm12,xmm16,xmm13,0x00 + vpsrldq xmm12,xmm12,4 + vpclmulqdq xmm15,xmm16,xmm13,0x10 + vpslldq xmm15,xmm15,4 + + vpternlogq xmm14,xmm15,xmm12,0x96 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti32x4 xmm11,zmm3,2 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10]{k1},zmm3 + vmovdqu8 zmm3{k1}{z},zmm3 + vpshufb zmm17,zmm0,zmm29 + vpshufb zmm19,zmm3,zmm29 + vextracti32x4 xmm7,zmm19,2 + sub r13,16 * (7 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_264 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm19,zmm1,0x01 + vpclmulqdq zmm5,zmm19,zmm1,0x10 + vpclmulqdq zmm0,zmm19,zmm1,0x11 + vpclmulqdq zmm3,zmm19,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm31 + vpxorq zmm0,zmm0,zmm8 + vpxorq zmm3,zmm3,zmm22 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_264 +$L$_small_initial_partial_block_264: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 ymm1,YMMWORD[320+rdx] + 
vpclmulqdq ymm4,ymm19,ymm1,0x01 + vpclmulqdq ymm5,ymm19,ymm1,0x10 + vpclmulqdq ymm0,ymm19,ymm1,0x11 + vpclmulqdq ymm3,ymm19,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm31 + vpxorq zmm0,zmm0,zmm8 + vpxorq zmm3,zmm3,zmm22 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_264: + + or r13,r13 + je NEAR $L$_after_reduction_264 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_264: + jmp NEAR $L$_last_blocks_done_250 +$L$_last_num_blocks_is_8_250: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,64 + kmovq k1,[rax*8+r10] + cmp r15d,248 + jae NEAR $L$_16_blocks_overflow_265 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + jmp NEAR $L$_16_blocks_ok_265 + +$L$_16_blocks_overflow_265: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 +$L$_16_blocks_ok_265: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1280+rsp] + vmovdqu64 zmm1,ZMMWORD[512+rsp] + vextracti32x4 xmm2,zmm3,3 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[576+rsp] + vmovdqa64 zmm22,ZMMWORD[1344+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[640+rsp] + vmovdqa64 zmm8,ZMMWORD[1408+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[704+rsp] + vmovdqa64 zmm22,ZMMWORD[1472+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19{k1}{z},[64+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm14,zmm24,zmm12,0x96 + vpternlogq zmm7,zmm25,zmm13,0x96 + vpternlogq zmm10,zmm26,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc 
zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vpsrldq zmm15,zmm10,8 + vpslldq zmm10,zmm10,8 + + vmovdqa64 xmm16,XMMWORD[POLY2] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vpxorq zmm14,zmm14,zmm15 + vpxorq zmm7,zmm7,zmm10 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vextracti64x4 ymm12,zmm14,1 + vpxorq ymm14,ymm14,ymm12 + vextracti32x4 xmm12,ymm14,1 + vpxorq xmm14,xmm14,xmm12 + vextracti64x4 ymm13,zmm7,1 + vpxorq ymm7,ymm7,ymm13 + vextracti32x4 xmm13,ymm7,1 + vpxorq xmm7,xmm7,xmm13 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vpclmulqdq xmm13,xmm16,xmm7,0x01 + vpslldq xmm13,xmm13,8 + vpxorq xmm13,xmm7,xmm13 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vpclmulqdq xmm12,xmm16,xmm13,0x00 + vpsrldq xmm12,xmm12,4 + vpclmulqdq xmm15,xmm16,xmm13,0x10 + vpslldq xmm15,xmm15,4 + + vpternlogq xmm14,xmm15,xmm12,0x96 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti32x4 xmm11,zmm3,3 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10]{k1},zmm3 + vmovdqu8 zmm3{k1}{z},zmm3 + vpshufb zmm17,zmm0,zmm29 + vpshufb zmm19,zmm3,zmm29 + vextracti32x4 xmm7,zmm19,3 + sub r13,16 * (8 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_266 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[224+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_266 +$L$_small_initial_partial_block_266: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm19,zmm1,0x01 + vpclmulqdq zmm5,zmm19,zmm1,0x10 + vpclmulqdq zmm0,zmm19,zmm1,0x11 + vpclmulqdq zmm3,zmm19,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm31 + vpxorq zmm0,zmm0,zmm8 + vpxorq zmm3,zmm3,zmm22 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 
+ vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_266: + + or r13,r13 + je NEAR $L$_after_reduction_266 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_266: + jmp NEAR $L$_last_blocks_done_250 +$L$_last_num_blocks_is_9_250: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,128 + kmovq k1,[rax*8+r10] + cmp r15d,247 + jae NEAR $L$_16_blocks_overflow_267 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd xmm4,xmm3,xmm27 + jmp NEAR $L$_16_blocks_ok_267 + +$L$_16_blocks_overflow_267: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb xmm4,xmm4,xmm29 +$L$_16_blocks_ok_267: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1280+rsp] + vmovdqu64 zmm1,ZMMWORD[512+rsp] + vextracti32x4 xmm2,zmm4,0 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[576+rsp] + vmovdqa64 zmm22,ZMMWORD[1344+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[640+rsp] + vmovdqa64 zmm8,ZMMWORD[1408+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[704+rsp] + vmovdqa64 zmm22,ZMMWORD[1472+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 xmm20{k1}{z},[128+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm14,zmm24,zmm12,0x96 + vpternlogq zmm7,zmm25,zmm13,0x96 + vpternlogq zmm10,zmm26,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vpsrldq zmm15,zmm10,8 + vpslldq zmm10,zmm10,8 + + vmovdqa64 xmm16,XMMWORD[POLY2] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc xmm4,xmm4,xmm30 + vbroadcastf64x2 
zmm30,ZMMWORD[160+rcx] + vpxorq zmm14,zmm14,zmm15 + vpxorq zmm7,zmm7,zmm10 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vextracti64x4 ymm12,zmm14,1 + vpxorq ymm14,ymm14,ymm12 + vextracti32x4 xmm12,ymm14,1 + vpxorq xmm14,xmm14,xmm12 + vextracti64x4 ymm13,zmm7,1 + vpxorq ymm7,ymm7,ymm13 + vextracti32x4 xmm13,ymm7,1 + vpxorq xmm7,xmm7,xmm13 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vpclmulqdq xmm13,xmm16,xmm7,0x01 + vpslldq xmm13,xmm13,8 + vpxorq xmm13,xmm7,xmm13 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vpclmulqdq xmm12,xmm16,xmm13,0x00 + vpsrldq xmm12,xmm12,4 + vpclmulqdq xmm15,xmm16,xmm13,0x10 + vpslldq xmm15,xmm15,4 + + vpternlogq xmm14,xmm15,xmm12,0x96 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast xmm4,xmm4,xmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq xmm4,xmm4,xmm20 + vextracti32x4 xmm11,zmm4,0 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 XMMWORD[128+r11*1+r10]{k1},xmm4 + vmovdqu8 zmm4{k1}{z},zmm4 + vpshufb zmm17,zmm0,zmm29 + vpshufb zmm19,zmm3,zmm29 + vpshufb xmm20,xmm4,xmm29 + vextracti32x4 xmm7,zmm20,0 + sub r13,16 * (9 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_268 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[208+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm20,xmm1,0x01 + vpclmulqdq xmm5,xmm20,xmm1,0x10 + vpclmulqdq xmm0,xmm20,xmm1,0x11 + vpclmulqdq xmm3,xmm20,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm31 + vpxorq zmm0,zmm0,zmm8 + vpxorq zmm3,zmm3,zmm22 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_268 +$L$_small_initial_partial_block_268: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[224+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 
+ vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_268: + + or r13,r13 + je NEAR $L$_after_reduction_268 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_268: + jmp NEAR $L$_last_blocks_done_250 +$L$_last_num_blocks_is_10_250: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,128 + kmovq k1,[rax*8+r10] + cmp r15d,246 + jae NEAR $L$_16_blocks_overflow_269 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd ymm4,ymm3,ymm27 + jmp NEAR $L$_16_blocks_ok_269 + +$L$_16_blocks_overflow_269: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb ymm4,ymm4,ymm29 +$L$_16_blocks_ok_269: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1280+rsp] + vmovdqu64 zmm1,ZMMWORD[512+rsp] + vextracti32x4 xmm2,zmm4,1 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[576+rsp] + vmovdqa64 zmm22,ZMMWORD[1344+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[640+rsp] + vmovdqa64 zmm8,ZMMWORD[1408+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[704+rsp] + vmovdqa64 zmm22,ZMMWORD[1472+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 ymm20{k1}{z},[128+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm14,zmm24,zmm12,0x96 + vpternlogq zmm7,zmm25,zmm13,0x96 + vpternlogq zmm10,zmm26,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vbroadcastf64x2 
zmm31,ZMMWORD[144+rcx] + vpsrldq zmm15,zmm10,8 + vpslldq zmm10,zmm10,8 + + vmovdqa64 xmm16,XMMWORD[POLY2] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vpxorq zmm14,zmm14,zmm15 + vpxorq zmm7,zmm7,zmm10 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vextracti64x4 ymm12,zmm14,1 + vpxorq ymm14,ymm14,ymm12 + vextracti32x4 xmm12,ymm14,1 + vpxorq xmm14,xmm14,xmm12 + vextracti64x4 ymm13,zmm7,1 + vpxorq ymm7,ymm7,ymm13 + vextracti32x4 xmm13,ymm7,1 + vpxorq xmm7,xmm7,xmm13 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vpclmulqdq xmm13,xmm16,xmm7,0x01 + vpslldq xmm13,xmm13,8 + vpxorq xmm13,xmm7,xmm13 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vpclmulqdq xmm12,xmm16,xmm13,0x00 + vpsrldq xmm12,xmm12,4 + vpclmulqdq xmm15,xmm16,xmm13,0x10 + vpslldq xmm15,xmm15,4 + + vpternlogq xmm14,xmm15,xmm12,0x96 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast ymm4,ymm4,ymm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq ymm4,ymm4,ymm20 + vextracti32x4 xmm11,zmm4,1 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 YMMWORD[128+r11*1+r10]{k1},ymm4 + vmovdqu8 zmm4{k1}{z},zmm4 + vpshufb zmm17,zmm0,zmm29 + vpshufb zmm19,zmm3,zmm29 + vpshufb ymm20,ymm4,ymm29 + vextracti32x4 xmm7,zmm20,1 + sub r13,16 * (10 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_270 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[192+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm20,ymm1,0x01 + vpclmulqdq ymm5,ymm20,ymm1,0x10 + vpclmulqdq ymm0,ymm20,ymm1,0x11 + vpclmulqdq ymm3,ymm20,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm31 + vpxorq zmm0,zmm0,zmm8 + vpxorq zmm3,zmm3,zmm22 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_270 +$L$_small_initial_partial_block_270: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[208+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq 
zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm20,xmm1,0x01 + vpclmulqdq xmm5,xmm20,xmm1,0x10 + vpclmulqdq xmm0,xmm20,xmm1,0x11 + vpclmulqdq xmm3,xmm20,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm31 + vpxorq zmm0,zmm0,zmm8 + vpxorq zmm3,zmm3,zmm22 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_270: + + or r13,r13 + je NEAR $L$_after_reduction_270 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_270: + jmp NEAR $L$_last_blocks_done_250 +$L$_last_num_blocks_is_11_250: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,128 + kmovq k1,[rax*8+r10] + cmp r15d,245 + jae NEAR $L$_16_blocks_overflow_271 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + jmp NEAR $L$_16_blocks_ok_271 + +$L$_16_blocks_overflow_271: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 +$L$_16_blocks_ok_271: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1280+rsp] + vmovdqu64 zmm1,ZMMWORD[512+rsp] + vextracti32x4 xmm2,zmm4,2 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[576+rsp] + vmovdqa64 zmm22,ZMMWORD[1344+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[640+rsp] + vmovdqa64 zmm8,ZMMWORD[1408+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[704+rsp] + vmovdqa64 zmm22,ZMMWORD[1472+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20{k1}{z},[128+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq 
zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm14,zmm24,zmm12,0x96 + vpternlogq zmm7,zmm25,zmm13,0x96 + vpternlogq zmm10,zmm26,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vpsrldq zmm15,zmm10,8 + vpslldq zmm10,zmm10,8 + + vmovdqa64 xmm16,XMMWORD[POLY2] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vpxorq zmm14,zmm14,zmm15 + vpxorq zmm7,zmm7,zmm10 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vextracti64x4 ymm12,zmm14,1 + vpxorq ymm14,ymm14,ymm12 + vextracti32x4 xmm12,ymm14,1 + vpxorq xmm14,xmm14,xmm12 + vextracti64x4 ymm13,zmm7,1 + vpxorq ymm7,ymm7,ymm13 + vextracti32x4 xmm13,ymm7,1 + vpxorq xmm7,xmm7,xmm13 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vpclmulqdq xmm13,xmm16,xmm7,0x01 + vpslldq xmm13,xmm13,8 + vpxorq xmm13,xmm7,xmm13 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vpclmulqdq xmm12,xmm16,xmm13,0x00 + vpsrldq xmm12,xmm12,4 + vpclmulqdq xmm15,xmm16,xmm13,0x10 + vpslldq xmm15,xmm15,4 + + vpternlogq xmm14,xmm15,xmm12,0x96 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vextracti32x4 xmm11,zmm4,2 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10]{k1},zmm4 + vmovdqu8 zmm4{k1}{z},zmm4 + vpshufb zmm17,zmm0,zmm29 + vpshufb zmm19,zmm3,zmm29 + vpshufb zmm20,zmm4,zmm29 + vextracti32x4 xmm7,zmm20,2 + sub r13,16 * (11 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_272 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[176+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm20,zmm1,0x01 + vpclmulqdq zmm5,zmm20,zmm1,0x10 + vpclmulqdq zmm0,zmm20,zmm1,0x11 + vpclmulqdq zmm3,zmm20,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm31 + vpxorq zmm0,zmm0,zmm8 + vpxorq zmm3,zmm3,zmm22 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR 
$L$_small_initial_compute_done_272 +$L$_small_initial_partial_block_272: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[192+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm20,ymm1,0x01 + vpclmulqdq ymm5,ymm20,ymm1,0x10 + vpclmulqdq ymm0,ymm20,ymm1,0x11 + vpclmulqdq ymm3,ymm20,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm31 + vpxorq zmm0,zmm0,zmm8 + vpxorq zmm3,zmm3,zmm22 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_272: + + or r13,r13 + je NEAR $L$_after_reduction_272 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_272: + jmp NEAR $L$_last_blocks_done_250 +$L$_last_num_blocks_is_12_250: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,128 + kmovq k1,[rax*8+r10] + cmp r15d,244 + jae NEAR $L$_16_blocks_overflow_273 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + jmp NEAR $L$_16_blocks_ok_273 + +$L$_16_blocks_overflow_273: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 +$L$_16_blocks_ok_273: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1280+rsp] + vmovdqu64 zmm1,ZMMWORD[512+rsp] + vextracti32x4 xmm2,zmm4,3 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[576+rsp] + vmovdqa64 zmm22,ZMMWORD[1344+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[640+rsp] + vmovdqa64 zmm8,ZMMWORD[1408+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[704+rsp] + vmovdqa64 zmm22,ZMMWORD[1472+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] 
+ + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20{k1}{z},[128+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm14,zmm24,zmm12,0x96 + vpternlogq zmm7,zmm25,zmm13,0x96 + vpternlogq zmm10,zmm26,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vpsrldq zmm15,zmm10,8 + vpslldq zmm10,zmm10,8 + + vmovdqa64 xmm16,XMMWORD[POLY2] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vpxorq zmm14,zmm14,zmm15 + vpxorq zmm7,zmm7,zmm10 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vextracti64x4 ymm12,zmm14,1 + vpxorq ymm14,ymm14,ymm12 + vextracti32x4 xmm12,ymm14,1 + vpxorq xmm14,xmm14,xmm12 + vextracti64x4 ymm13,zmm7,1 + vpxorq ymm7,ymm7,ymm13 + vextracti32x4 xmm13,ymm7,1 + vpxorq xmm7,xmm7,xmm13 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vpclmulqdq xmm13,xmm16,xmm7,0x01 + vpslldq xmm13,xmm13,8 + vpxorq xmm13,xmm7,xmm13 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vpclmulqdq xmm12,xmm16,xmm13,0x00 + vpsrldq xmm12,xmm12,4 + vpclmulqdq xmm15,xmm16,xmm13,0x10 + vpslldq xmm15,xmm15,4 + + vpternlogq xmm14,xmm15,xmm12,0x96 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vextracti32x4 xmm11,zmm4,3 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10]{k1},zmm4 + vmovdqu8 zmm4{k1}{z},zmm4 + vpshufb zmm17,zmm0,zmm29 + vpshufb zmm19,zmm3,zmm29 + vpshufb zmm20,zmm4,zmm29 + vextracti32x4 xmm7,zmm20,3 + sub r13,16 * (12 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_274 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[160+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[224+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + 
vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_274 +$L$_small_initial_partial_block_274: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[176+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm20,zmm1,0x01 + vpclmulqdq zmm5,zmm20,zmm1,0x10 + vpclmulqdq zmm0,zmm20,zmm1,0x11 + vpclmulqdq zmm3,zmm20,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm31 + vpxorq zmm0,zmm0,zmm8 + vpxorq zmm3,zmm3,zmm22 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_274: + + or r13,r13 + je NEAR $L$_after_reduction_274 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_274: + jmp NEAR $L$_last_blocks_done_250 +$L$_last_num_blocks_is_13_250: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,192 + kmovq k1,[rax*8+r10] + cmp r15d,243 + jae NEAR $L$_16_blocks_overflow_275 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + vpaddd xmm5,xmm4,xmm27 + jmp NEAR $L$_16_blocks_ok_275 + +$L$_16_blocks_overflow_275: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpaddd zmm5,zmm4,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb xmm5,xmm5,xmm29 +$L$_16_blocks_ok_275: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1280+rsp] + vmovdqu64 zmm1,ZMMWORD[512+rsp] + vextracti32x4 xmm2,zmm5,0 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[576+rsp] + vmovdqa64 zmm22,ZMMWORD[1344+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vpxorq xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[640+rsp] + vmovdqa64 zmm8,ZMMWORD[1408+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + 
vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[704+rsp] + vmovdqa64 zmm22,ZMMWORD[1472+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20,ZMMWORD[128+r11*1+r9] + vmovdqu8 xmm21{k1}{z},[192+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm14,zmm24,zmm12,0x96 + vpternlogq zmm7,zmm25,zmm13,0x96 + vpternlogq zmm10,zmm26,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vpsrldq zmm15,zmm10,8 + vpslldq zmm10,zmm10,8 + + vmovdqa64 xmm16,XMMWORD[POLY2] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vpxorq zmm14,zmm14,zmm15 + vpxorq zmm7,zmm7,zmm10 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vextracti64x4 ymm12,zmm14,1 + vpxorq ymm14,ymm14,ymm12 + vextracti32x4 xmm12,ymm14,1 + vpxorq xmm14,xmm14,xmm12 + vextracti64x4 ymm13,zmm7,1 + vpxorq ymm7,ymm7,ymm13 + vextracti32x4 xmm13,ymm7,1 + vpxorq xmm7,xmm7,xmm13 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vpclmulqdq xmm13,xmm16,xmm7,0x01 + vpslldq xmm13,xmm13,8 + vpxorq xmm13,xmm7,xmm13 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vpclmulqdq xmm12,xmm16,xmm13,0x00 + vpsrldq xmm12,xmm12,4 + vpclmulqdq xmm15,xmm16,xmm13,0x10 + vpslldq xmm15,xmm15,4 + + vpternlogq xmm14,xmm15,xmm12,0x96 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vaesenclast xmm5,xmm5,xmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vpxorq xmm5,xmm5,xmm21 + vextracti32x4 xmm11,zmm5,0 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm4 + vmovdqu8 XMMWORD[192+r11*1+r10]{k1},xmm5 + vmovdqu8 zmm5{k1}{z},zmm5 + vpshufb zmm17,zmm0,zmm29 + vpshufb zmm19,zmm3,zmm29 + vpshufb zmm20,zmm4,zmm29 + vpshufb xmm21,xmm5,xmm29 + vextracti32x4 xmm7,zmm21,0 + sub r13,16 
* (13 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_276 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[144+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[208+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm21,xmm1,0x01 + vpclmulqdq xmm5,xmm21,xmm1,0x10 + vpclmulqdq xmm0,xmm21,xmm1,0x11 + vpclmulqdq xmm3,xmm21,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm31 + vpxorq zmm0,zmm0,zmm8 + vpxorq zmm3,zmm3,zmm22 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_276 +$L$_small_initial_partial_block_276: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[160+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[224+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_276: + + or r13,r13 + je NEAR $L$_after_reduction_276 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_276: + jmp NEAR $L$_last_blocks_done_250 +$L$_last_num_blocks_is_14_250: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,192 + kmovq k1,[rax*8+r10] + cmp r15d,242 + jae NEAR $L$_16_blocks_overflow_277 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + vpaddd 
ymm5,ymm4,ymm27 + jmp NEAR $L$_16_blocks_ok_277 + +$L$_16_blocks_overflow_277: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpaddd zmm5,zmm4,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb ymm5,ymm5,ymm29 +$L$_16_blocks_ok_277: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1280+rsp] + vmovdqu64 zmm1,ZMMWORD[512+rsp] + vextracti32x4 xmm2,zmm5,1 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[576+rsp] + vmovdqa64 zmm22,ZMMWORD[1344+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vpxorq ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[640+rsp] + vmovdqa64 zmm8,ZMMWORD[1408+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[704+rsp] + vmovdqa64 zmm22,ZMMWORD[1472+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20,ZMMWORD[128+r11*1+r9] + vmovdqu8 ymm21{k1}{z},[192+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm14,zmm24,zmm12,0x96 + vpternlogq zmm7,zmm25,zmm13,0x96 + vpternlogq zmm10,zmm26,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vpsrldq zmm15,zmm10,8 + vpslldq zmm10,zmm10,8 + + vmovdqa64 xmm16,XMMWORD[POLY2] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vpxorq zmm14,zmm14,zmm15 + vpxorq zmm7,zmm7,zmm10 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vextracti64x4 ymm12,zmm14,1 + vpxorq ymm14,ymm14,ymm12 + vextracti32x4 xmm12,ymm14,1 + vpxorq xmm14,xmm14,xmm12 + vextracti64x4 ymm13,zmm7,1 + vpxorq ymm7,ymm7,ymm13 + vextracti32x4 
xmm13,ymm7,1 + vpxorq xmm7,xmm7,xmm13 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vpclmulqdq xmm13,xmm16,xmm7,0x01 + vpslldq xmm13,xmm13,8 + vpxorq xmm13,xmm7,xmm13 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vpclmulqdq xmm12,xmm16,xmm13,0x00 + vpsrldq xmm12,xmm12,4 + vpclmulqdq xmm15,xmm16,xmm13,0x10 + vpslldq xmm15,xmm15,4 + + vpternlogq xmm14,xmm15,xmm12,0x96 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vaesenclast ymm5,ymm5,ymm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vpxorq ymm5,ymm5,ymm21 + vextracti32x4 xmm11,zmm5,1 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm4 + vmovdqu8 YMMWORD[192+r11*1+r10]{k1},ymm5 + vmovdqu8 zmm5{k1}{z},zmm5 + vpshufb zmm17,zmm0,zmm29 + vpshufb zmm19,zmm3,zmm29 + vpshufb zmm20,zmm4,zmm29 + vpshufb ymm21,ymm5,ymm29 + vextracti32x4 xmm7,zmm21,1 + sub r13,16 * (14 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_278 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[128+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[192+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm21,ymm1,0x01 + vpclmulqdq ymm5,ymm21,ymm1,0x10 + vpclmulqdq ymm0,ymm21,ymm1,0x11 + vpclmulqdq ymm3,ymm21,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm31 + vpxorq zmm0,zmm0,zmm8 + vpxorq zmm3,zmm3,zmm22 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_278 +$L$_small_initial_partial_block_278: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[144+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[208+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + 
vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm21,xmm1,0x01 + vpclmulqdq xmm5,xmm21,xmm1,0x10 + vpclmulqdq xmm0,xmm21,xmm1,0x11 + vpclmulqdq xmm3,xmm21,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm31 + vpxorq zmm0,zmm0,zmm8 + vpxorq zmm3,zmm3,zmm22 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_278: + + or r13,r13 + je NEAR $L$_after_reduction_278 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_278: + jmp NEAR $L$_last_blocks_done_250 +$L$_last_num_blocks_is_15_250: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,192 + kmovq k1,[rax*8+r10] + cmp r15d,241 + jae NEAR $L$_16_blocks_overflow_279 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + vpaddd zmm5,zmm4,zmm27 + jmp NEAR $L$_16_blocks_ok_279 + +$L$_16_blocks_overflow_279: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpaddd zmm5,zmm4,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb zmm5,zmm5,zmm29 +$L$_16_blocks_ok_279: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1280+rsp] + vmovdqu64 zmm1,ZMMWORD[512+rsp] + vextracti32x4 xmm2,zmm5,2 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[576+rsp] + vmovdqa64 zmm22,ZMMWORD[1344+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[640+rsp] + vmovdqa64 zmm8,ZMMWORD[1408+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[704+rsp] + vmovdqa64 zmm22,ZMMWORD[1472+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + 
vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20,ZMMWORD[128+r11*1+r9] + vmovdqu8 zmm21{k1}{z},[192+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm14,zmm24,zmm12,0x96 + vpternlogq zmm7,zmm25,zmm13,0x96 + vpternlogq zmm10,zmm26,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vpsrldq zmm15,zmm10,8 + vpslldq zmm10,zmm10,8 + + vmovdqa64 xmm16,XMMWORD[POLY2] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vpxorq zmm14,zmm14,zmm15 + vpxorq zmm7,zmm7,zmm10 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vextracti64x4 ymm12,zmm14,1 + vpxorq ymm14,ymm14,ymm12 + vextracti32x4 xmm12,ymm14,1 + vpxorq xmm14,xmm14,xmm12 + vextracti64x4 ymm13,zmm7,1 + vpxorq ymm7,ymm7,ymm13 + vextracti32x4 xmm13,ymm7,1 + vpxorq xmm7,xmm7,xmm13 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vpclmulqdq xmm13,xmm16,xmm7,0x01 + vpslldq xmm13,xmm13,8 + vpxorq xmm13,xmm7,xmm13 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vpclmulqdq xmm12,xmm16,xmm13,0x00 + vpsrldq xmm12,xmm12,4 + vpclmulqdq xmm15,xmm16,xmm13,0x10 + vpslldq xmm15,xmm15,4 + + vpternlogq xmm14,xmm15,xmm12,0x96 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vaesenclast zmm5,zmm5,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vpxorq zmm5,zmm5,zmm21 + vextracti32x4 xmm11,zmm5,2 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm4 + vmovdqu8 ZMMWORD[192+r11*1+r10]{k1},zmm5 + vmovdqu8 zmm5{k1}{z},zmm5 + vpshufb zmm17,zmm0,zmm29 + vpshufb zmm19,zmm3,zmm29 + vpshufb zmm20,zmm4,zmm29 + vpshufb zmm21,zmm5,zmm29 + vextracti32x4 xmm7,zmm21,2 + sub r13,16 * (15 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_280 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[112+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[176+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm21,zmm1,0x01 + vpclmulqdq 
zmm5,zmm21,zmm1,0x10 + vpclmulqdq zmm0,zmm21,zmm1,0x11 + vpclmulqdq zmm3,zmm21,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm31 + vpxorq zmm0,zmm0,zmm8 + vpxorq zmm3,zmm3,zmm22 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_280 +$L$_small_initial_partial_block_280: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[128+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[192+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm21,ymm1,0x01 + vpclmulqdq ymm5,ymm21,ymm1,0x10 + vpclmulqdq ymm0,ymm21,ymm1,0x11 + vpclmulqdq ymm3,ymm21,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm31 + vpxorq zmm0,zmm0,zmm8 + vpxorq zmm3,zmm3,zmm22 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_280: + + or r13,r13 + je NEAR $L$_after_reduction_280 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_280: + jmp NEAR $L$_last_blocks_done_250 +$L$_last_num_blocks_is_16_250: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,192 + kmovq k1,[rax*8+r10] + cmp r15d,240 + jae NEAR $L$_16_blocks_overflow_281 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + vpaddd zmm5,zmm4,zmm27 + jmp NEAR $L$_16_blocks_ok_281 + +$L$_16_blocks_overflow_281: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpaddd zmm5,zmm4,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb zmm5,zmm5,zmm29 +$L$_16_blocks_ok_281: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1280+rsp] + vmovdqu64 zmm1,ZMMWORD[512+rsp] + vextracti32x4 xmm2,zmm5,3 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 
zmm18,ZMMWORD[576+rsp] + vmovdqa64 zmm22,ZMMWORD[1344+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[640+rsp] + vmovdqa64 zmm8,ZMMWORD[1408+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[704+rsp] + vmovdqa64 zmm22,ZMMWORD[1472+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20,ZMMWORD[128+r11*1+r9] + vmovdqu8 zmm21{k1}{z},[192+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm14,zmm24,zmm12,0x96 + vpternlogq zmm7,zmm25,zmm13,0x96 + vpternlogq zmm10,zmm26,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vpsrldq zmm15,zmm10,8 + vpslldq zmm10,zmm10,8 + + vmovdqa64 xmm16,XMMWORD[POLY2] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vpxorq zmm14,zmm14,zmm15 + vpxorq zmm7,zmm7,zmm10 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vextracti64x4 ymm12,zmm14,1 + vpxorq ymm14,ymm14,ymm12 + vextracti32x4 xmm12,ymm14,1 + vpxorq xmm14,xmm14,xmm12 + vextracti64x4 ymm13,zmm7,1 + vpxorq ymm7,ymm7,ymm13 + vextracti32x4 xmm13,ymm7,1 + vpxorq xmm7,xmm7,xmm13 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vpclmulqdq xmm13,xmm16,xmm7,0x01 + vpslldq xmm13,xmm13,8 + vpxorq xmm13,xmm7,xmm13 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vpclmulqdq xmm12,xmm16,xmm13,0x00 + vpsrldq xmm12,xmm12,4 + vpclmulqdq xmm15,xmm16,xmm13,0x10 + vpslldq xmm15,xmm15,4 + + vpternlogq xmm14,xmm15,xmm12,0x96 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast 
zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vaesenclast zmm5,zmm5,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vpxorq zmm5,zmm5,zmm21 + vextracti32x4 xmm11,zmm5,3 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm4 + vmovdqu8 ZMMWORD[192+r11*1+r10]{k1},zmm5 + vmovdqu8 zmm5{k1}{z},zmm5 + vpshufb zmm17,zmm0,zmm29 + vpshufb zmm19,zmm3,zmm29 + vpshufb zmm20,zmm4,zmm29 + vpshufb zmm21,zmm5,zmm29 + vextracti32x4 xmm7,zmm21,3 + sub r13,16 * (16 - 1) +$L$_small_initial_partial_block_282: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[112+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[176+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm21,zmm1,0x01 + vpclmulqdq zmm5,zmm21,zmm1,0x10 + vpclmulqdq zmm0,zmm21,zmm1,0x11 + vpclmulqdq zmm3,zmm21,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm31 + vpxorq zmm0,zmm0,zmm8 + vpxorq zmm3,zmm3,zmm22 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_282: + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_282: + jmp NEAR $L$_last_blocks_done_250 +$L$_last_num_blocks_is_0_250: + vmovdqa64 zmm13,ZMMWORD[1280+rsp] + vmovdqu64 zmm12,ZMMWORD[512+rsp] + vpclmulqdq zmm0,zmm13,zmm12,0x11 + vpclmulqdq zmm3,zmm13,zmm12,0x00 + vpclmulqdq zmm4,zmm13,zmm12,0x01 + vpclmulqdq zmm5,zmm13,zmm12,0x10 + vmovdqa64 zmm13,ZMMWORD[1344+rsp] + vmovdqu64 zmm12,ZMMWORD[576+rsp] + vpclmulqdq zmm6,zmm13,zmm12,0x11 + vpclmulqdq zmm7,zmm13,zmm12,0x00 + vpclmulqdq zmm10,zmm13,zmm12,0x01 + vpclmulqdq zmm11,zmm13,zmm12,0x10 + vpternlogq zmm26,zmm4,zmm10,0x96 + vpternlogq zmm24,zmm0,zmm6,0x96 + vpternlogq zmm25,zmm3,zmm7,0x96 + vpternlogq zmm26,zmm5,zmm11,0x96 + vmovdqa64 zmm13,ZMMWORD[1408+rsp] + vmovdqu64 zmm12,ZMMWORD[640+rsp] + vpclmulqdq zmm0,zmm13,zmm12,0x11 + vpclmulqdq zmm3,zmm13,zmm12,0x00 + vpclmulqdq zmm4,zmm13,zmm12,0x01 + vpclmulqdq zmm5,zmm13,zmm12,0x10 + vmovdqa64 zmm13,ZMMWORD[1472+rsp] + vmovdqu64 zmm12,ZMMWORD[704+rsp] + vpclmulqdq zmm6,zmm13,zmm12,0x11 + vpclmulqdq zmm7,zmm13,zmm12,0x00 + vpclmulqdq zmm10,zmm13,zmm12,0x01 + vpclmulqdq zmm11,zmm13,zmm12,0x10 + + vpternlogq zmm26,zmm4,zmm10,0x96 + vpternlogq zmm24,zmm0,zmm6,0x96 + vpternlogq zmm25,zmm3,zmm7,0x96 + vpternlogq 
zmm26,zmm5,zmm11,0x96 + + vpsrldq zmm0,zmm26,8 + vpslldq zmm3,zmm26,8 + vpxorq zmm24,zmm24,zmm0 + vpxorq zmm25,zmm25,zmm3 + vextracti64x4 ymm0,zmm24,1 + vpxorq ymm24,ymm24,ymm0 + vextracti32x4 xmm0,ymm24,1 + vpxorq xmm24,xmm24,xmm0 + vextracti64x4 ymm3,zmm25,1 + vpxorq ymm25,ymm25,ymm3 + vextracti32x4 xmm3,ymm25,1 + vpxorq xmm25,xmm25,xmm3 + vmovdqa64 xmm4,XMMWORD[POLY2] + + + vpclmulqdq xmm0,xmm4,xmm25,0x01 + vpslldq xmm0,xmm0,8 + vpxorq xmm0,xmm25,xmm0 + + + vpclmulqdq xmm3,xmm4,xmm0,0x00 + vpsrldq xmm3,xmm3,4 + vpclmulqdq xmm14,xmm4,xmm0,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm3,xmm24,0x96 + +$L$_last_blocks_done_250: + vpshufb xmm2,xmm2,xmm29 + jmp NEAR $L$_ghash_done_172 + +$L$_message_below_32_blocks_172: + + + sub r13,256 + add r11,256 + mov r10d,r13d + test r14,r14 + jnz NEAR $L$_skip_hkeys_precomputation_283 + vmovdqu64 zmm3,ZMMWORD[640+rsp] + + + vshufi64x2 zmm3,zmm3,zmm3,0x00 + + vmovdqu64 zmm4,ZMMWORD[576+rsp] + vmovdqu64 zmm5,ZMMWORD[512+rsp] + + vpclmulqdq zmm6,zmm4,zmm3,0x11 + vpclmulqdq zmm7,zmm4,zmm3,0x00 + vpclmulqdq zmm10,zmm4,zmm3,0x01 + vpclmulqdq zmm4,zmm4,zmm3,0x10 + vpxorq zmm4,zmm4,zmm10 + + vpsrldq zmm10,zmm4,8 + vpslldq zmm4,zmm4,8 + vpxorq zmm6,zmm6,zmm10 + vpxorq zmm4,zmm4,zmm7 + + + + vmovdqu64 zmm10,ZMMWORD[POLY2] + + vpclmulqdq zmm7,zmm10,zmm4,0x01 + vpslldq zmm7,zmm7,8 + vpxorq zmm4,zmm4,zmm7 + + + + vpclmulqdq zmm7,zmm10,zmm4,0x00 + vpsrldq zmm7,zmm7,4 + vpclmulqdq zmm4,zmm10,zmm4,0x10 + vpslldq zmm4,zmm4,4 + + vpternlogq zmm4,zmm6,zmm7,0x96 + + vmovdqu64 ZMMWORD[448+rsp],zmm4 + + vpclmulqdq zmm6,zmm5,zmm3,0x11 + vpclmulqdq zmm7,zmm5,zmm3,0x00 + vpclmulqdq zmm10,zmm5,zmm3,0x01 + vpclmulqdq zmm5,zmm5,zmm3,0x10 + vpxorq zmm5,zmm5,zmm10 + + vpsrldq zmm10,zmm5,8 + vpslldq zmm5,zmm5,8 + vpxorq zmm6,zmm6,zmm10 + vpxorq zmm5,zmm5,zmm7 + + + + vmovdqu64 zmm10,ZMMWORD[POLY2] + + vpclmulqdq zmm7,zmm10,zmm5,0x01 + vpslldq zmm7,zmm7,8 + vpxorq zmm5,zmm5,zmm7 + + + + vpclmulqdq zmm7,zmm10,zmm5,0x00 + vpsrldq zmm7,zmm7,4 + vpclmulqdq zmm5,zmm10,zmm5,0x10 + vpslldq zmm5,zmm5,4 + + vpternlogq zmm5,zmm6,zmm7,0x96 + + vmovdqu64 ZMMWORD[384+rsp],zmm5 + + vpclmulqdq zmm6,zmm4,zmm3,0x11 + vpclmulqdq zmm7,zmm4,zmm3,0x00 + vpclmulqdq zmm10,zmm4,zmm3,0x01 + vpclmulqdq zmm4,zmm4,zmm3,0x10 + vpxorq zmm4,zmm4,zmm10 + + vpsrldq zmm10,zmm4,8 + vpslldq zmm4,zmm4,8 + vpxorq zmm6,zmm6,zmm10 + vpxorq zmm4,zmm4,zmm7 + + + + vmovdqu64 zmm10,ZMMWORD[POLY2] + + vpclmulqdq zmm7,zmm10,zmm4,0x01 + vpslldq zmm7,zmm7,8 + vpxorq zmm4,zmm4,zmm7 + + + + vpclmulqdq zmm7,zmm10,zmm4,0x00 + vpsrldq zmm7,zmm7,4 + vpclmulqdq zmm4,zmm10,zmm4,0x10 + vpslldq zmm4,zmm4,4 + + vpternlogq zmm4,zmm6,zmm7,0x96 + + vmovdqu64 ZMMWORD[320+rsp],zmm4 + + vpclmulqdq zmm6,zmm5,zmm3,0x11 + vpclmulqdq zmm7,zmm5,zmm3,0x00 + vpclmulqdq zmm10,zmm5,zmm3,0x01 + vpclmulqdq zmm5,zmm5,zmm3,0x10 + vpxorq zmm5,zmm5,zmm10 + + vpsrldq zmm10,zmm5,8 + vpslldq zmm5,zmm5,8 + vpxorq zmm6,zmm6,zmm10 + vpxorq zmm5,zmm5,zmm7 + + + + vmovdqu64 zmm10,ZMMWORD[POLY2] + + vpclmulqdq zmm7,zmm10,zmm5,0x01 + vpslldq zmm7,zmm7,8 + vpxorq zmm5,zmm5,zmm7 + + + + vpclmulqdq zmm7,zmm10,zmm5,0x00 + vpsrldq zmm7,zmm7,4 + vpclmulqdq zmm5,zmm10,zmm5,0x10 + vpslldq zmm5,zmm5,4 + + vpternlogq zmm5,zmm6,zmm7,0x96 + + vmovdqu64 ZMMWORD[256+rsp],zmm5 +$L$_skip_hkeys_precomputation_283: + mov r14,1 + and r10d,~15 + mov ebx,512 + sub ebx,r10d + mov r10d,r13d + add r10d,15 + shr r10d,4 + je NEAR $L$_last_num_blocks_is_0_284 + + cmp r10d,8 + je NEAR $L$_last_num_blocks_is_8_284 + jb NEAR $L$_last_num_blocks_is_7_1_284 + + + cmp r10d,12 + je NEAR 
$L$_last_num_blocks_is_12_284 + jb NEAR $L$_last_num_blocks_is_11_9_284 + + + cmp r10d,15 + je NEAR $L$_last_num_blocks_is_15_284 + ja NEAR $L$_last_num_blocks_is_16_284 + cmp r10d,14 + je NEAR $L$_last_num_blocks_is_14_284 + jmp NEAR $L$_last_num_blocks_is_13_284 + +$L$_last_num_blocks_is_11_9_284: + + cmp r10d,10 + je NEAR $L$_last_num_blocks_is_10_284 + ja NEAR $L$_last_num_blocks_is_11_284 + jmp NEAR $L$_last_num_blocks_is_9_284 + +$L$_last_num_blocks_is_7_1_284: + cmp r10d,4 + je NEAR $L$_last_num_blocks_is_4_284 + jb NEAR $L$_last_num_blocks_is_3_1_284 + + cmp r10d,6 + ja NEAR $L$_last_num_blocks_is_7_284 + je NEAR $L$_last_num_blocks_is_6_284 + jmp NEAR $L$_last_num_blocks_is_5_284 + +$L$_last_num_blocks_is_3_1_284: + + cmp r10d,2 + ja NEAR $L$_last_num_blocks_is_3_284 + je NEAR $L$_last_num_blocks_is_2_284 +$L$_last_num_blocks_is_1_284: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + kmovq k1,[rax*8+r10] + cmp r15d,255 + jae NEAR $L$_16_blocks_overflow_285 + vpaddd xmm0,xmm2,xmm28 + jmp NEAR $L$_16_blocks_ok_285 + +$L$_16_blocks_overflow_285: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpshufb xmm0,xmm0,xmm29 +$L$_16_blocks_ok_285: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm0,0 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc xmm0,xmm0,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc xmm0,xmm0,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 xmm17{k1}{z},[r11*1+r9] + vaesenc xmm0,xmm0,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc xmm0,xmm0,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc xmm0,xmm0,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc xmm0,xmm0,xmm31 + vaesenclast xmm0,xmm0,xmm30 + vpxorq xmm0,xmm0,xmm17 + vextracti32x4 xmm11,zmm0,0 + mov r10,QWORD[120+rbp] + vmovdqu8 XMMWORD[r11*1+r10]{k1},xmm0 + vmovdqu8 zmm0{k1}{z},zmm0 + vpshufb xmm17,xmm0,xmm29 + vextracti32x4 xmm7,zmm17,0 + + + cmp r13,16 + jl NEAR 
$L$_small_initial_partial_block_286 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm17,xmm1,0x01 + vpclmulqdq xmm5,xmm17,xmm1,0x10 + vpclmulqdq xmm0,xmm17,xmm1,0x11 + vpclmulqdq xmm3,xmm17,xmm1,0x00 + vpxorq zmm4,zmm4,zmm26 + vpxorq zmm0,zmm0,zmm24 + vpxorq zmm3,zmm3,zmm25 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_286 +$L$_small_initial_partial_block_286: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + + + vpsrldq zmm0,zmm26,8 + vpslldq zmm3,zmm26,8 + vpxorq zmm24,zmm24,zmm0 + vpxorq zmm25,zmm25,zmm3 + vextracti64x4 ymm0,zmm24,1 + vpxorq ymm24,ymm24,ymm0 + vextracti32x4 xmm0,ymm24,1 + vpxorq xmm24,xmm24,xmm0 + vextracti64x4 ymm3,zmm25,1 + vpxorq ymm25,ymm25,ymm3 + vextracti32x4 xmm3,ymm25,1 + vpxorq xmm25,xmm25,xmm3 + vmovdqa64 xmm0,XMMWORD[POLY2] + + + vpclmulqdq xmm3,xmm0,xmm25,0x01 + vpslldq xmm3,xmm3,8 + vpxorq xmm3,xmm25,xmm3 + + + vpclmulqdq xmm4,xmm0,xmm3,0x00 + vpsrldq xmm4,xmm4,4 + vpclmulqdq xmm14,xmm0,xmm3,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm4,xmm24,0x96 + + + + + + + + + + + + + vpxorq xmm14,xmm14,xmm7 + + jmp NEAR $L$_after_reduction_286 +$L$_small_initial_compute_done_286: +$L$_after_reduction_286: + jmp NEAR $L$_last_blocks_done_284 +$L$_last_num_blocks_is_2_284: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + kmovq k1,[rax*8+r10] + cmp r15d,254 + jae NEAR $L$_16_blocks_overflow_287 + vpaddd ymm0,ymm2,ymm28 + jmp NEAR $L$_16_blocks_ok_287 + +$L$_16_blocks_overflow_287: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpshufb ymm0,ymm0,ymm29 +$L$_16_blocks_ok_287: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm0,1 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc ymm0,ymm0,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc ymm0,ymm0,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc ymm0,ymm0,ymm30 + vbroadcastf64x2 
zmm30,ZMMWORD[96+rcx] + vmovdqu8 ymm17{k1}{z},[r11*1+r9] + vaesenc ymm0,ymm0,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc ymm0,ymm0,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc ymm0,ymm0,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc ymm0,ymm0,ymm31 + vaesenclast ymm0,ymm0,ymm30 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm11,zmm0,1 + mov r10,QWORD[120+rbp] + vmovdqu8 YMMWORD[r11*1+r10]{k1},ymm0 + vmovdqu8 zmm0{k1}{z},zmm0 + vpshufb ymm17,ymm0,ymm29 + vextracti32x4 xmm7,zmm17,1 + sub r13,16 * (2 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_288 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm17,ymm1,0x01 + vpclmulqdq ymm5,ymm17,ymm1,0x10 + vpclmulqdq ymm0,ymm17,ymm1,0x11 + vpclmulqdq ymm3,ymm17,ymm1,0x00 + vpxorq zmm4,zmm4,zmm26 + vpxorq zmm0,zmm0,zmm24 + vpxorq zmm3,zmm3,zmm25 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_288 +$L$_small_initial_partial_block_288: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm17,xmm1,0x01 + vpclmulqdq xmm5,xmm17,xmm1,0x10 + vpclmulqdq xmm0,xmm17,xmm1,0x11 + vpclmulqdq xmm3,xmm17,xmm1,0x00 + vpxorq zmm4,zmm4,zmm26 + vpxorq zmm0,zmm0,zmm24 + vpxorq zmm3,zmm3,zmm25 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_288: + + or r13,r13 + je NEAR $L$_after_reduction_288 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_288: + jmp NEAR $L$_last_blocks_done_284 +$L$_last_num_blocks_is_3_284: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + kmovq k1,[rax*8+r10] + cmp r15d,253 + jae NEAR $L$_16_blocks_overflow_289 + vpaddd zmm0,zmm2,zmm28 + jmp NEAR $L$_16_blocks_ok_289 + +$L$_16_blocks_overflow_289: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpshufb zmm0,zmm0,zmm29 +$L$_16_blocks_ok_289: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq 
zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm0,2 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17{k1}{z},[r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vpxorq zmm0,zmm0,zmm17 + vextracti32x4 xmm11,zmm0,2 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10]{k1},zmm0 + vmovdqu8 zmm0{k1}{z},zmm0 + vpshufb zmm17,zmm0,zmm29 + vextracti32x4 xmm7,zmm17,2 + sub r13,16 * (3 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_290 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpxorq zmm4,zmm4,zmm26 + vpxorq zmm0,zmm0,zmm24 + vpxorq zmm3,zmm3,zmm25 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_290 +$L$_small_initial_partial_block_290: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm17,ymm1,0x01 + vpclmulqdq ymm5,ymm17,ymm1,0x10 + vpclmulqdq 
ymm0,ymm17,ymm1,0x11 + vpclmulqdq ymm3,ymm17,ymm1,0x00 + vpxorq zmm4,zmm4,zmm26 + vpxorq zmm0,zmm0,zmm24 + vpxorq zmm3,zmm3,zmm25 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_290: + + or r13,r13 + je NEAR $L$_after_reduction_290 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_290: + jmp NEAR $L$_last_blocks_done_284 +$L$_last_num_blocks_is_4_284: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + kmovq k1,[rax*8+r10] + cmp r15d,252 + jae NEAR $L$_16_blocks_overflow_291 + vpaddd zmm0,zmm2,zmm28 + jmp NEAR $L$_16_blocks_ok_291 + +$L$_16_blocks_overflow_291: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpshufb zmm0,zmm0,zmm29 +$L$_16_blocks_ok_291: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm0,3 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17{k1}{z},[r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vpxorq zmm0,zmm0,zmm17 + vextracti32x4 xmm11,zmm0,3 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10]{k1},zmm0 + vmovdqu8 zmm0{k1}{z},zmm0 + vpshufb zmm17,zmm0,zmm29 + 
vextracti32x4 xmm7,zmm17,3 + sub r13,16 * (4 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_292 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + + vpxorq zmm30,zmm30,zmm26 + vpxorq zmm8,zmm8,zmm24 + vpxorq zmm22,zmm22,zmm25 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_292 +$L$_small_initial_partial_block_292: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpxorq zmm4,zmm4,zmm26 + vpxorq zmm0,zmm0,zmm24 + vpxorq zmm3,zmm3,zmm25 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_292: + + or r13,r13 + je NEAR $L$_after_reduction_292 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_292: + jmp NEAR $L$_last_blocks_done_284 +$L$_last_num_blocks_is_5_284: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,64 + kmovq k1,[rax*8+r10] + cmp r15d,251 + jae NEAR $L$_16_blocks_overflow_293 + vpaddd zmm0,zmm2,zmm28 + vpaddd xmm3,xmm0,xmm27 + jmp NEAR $L$_16_blocks_ok_293 + +$L$_16_blocks_overflow_293: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb xmm3,xmm3,xmm29 +$L$_16_blocks_ok_293: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm3,0 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq 
zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 xmm19{k1}{z},[64+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast xmm3,xmm3,xmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq xmm3,xmm3,xmm19 + vextracti32x4 xmm11,zmm3,0 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 XMMWORD[64+r11*1+r10]{k1},xmm3 + vmovdqu8 zmm3{k1}{z},zmm3 + vpshufb zmm17,zmm0,zmm29 + vpshufb xmm19,xmm3,xmm29 + vextracti32x4 xmm7,zmm19,0 + sub r13,16 * (5 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_294 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm19,xmm1,0x01 + vpclmulqdq xmm5,xmm19,xmm1,0x10 + vpclmulqdq xmm0,xmm19,xmm1,0x11 + vpclmulqdq xmm3,xmm19,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_294 +$L$_small_initial_partial_block_294: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + + vpxorq zmm30,zmm30,zmm26 + vpxorq zmm8,zmm8,zmm24 + 
vpxorq zmm22,zmm22,zmm25 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_294: + + or r13,r13 + je NEAR $L$_after_reduction_294 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_294: + jmp NEAR $L$_last_blocks_done_284 +$L$_last_num_blocks_is_6_284: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,64 + kmovq k1,[rax*8+r10] + cmp r15d,250 + jae NEAR $L$_16_blocks_overflow_295 + vpaddd zmm0,zmm2,zmm28 + vpaddd ymm3,ymm0,ymm27 + jmp NEAR $L$_16_blocks_ok_295 + +$L$_16_blocks_overflow_295: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb ymm3,ymm3,ymm29 +$L$_16_blocks_ok_295: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm3,1 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 ymm19{k1}{z},[64+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vbroadcastf64x2 
zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast ymm3,ymm3,ymm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm11,zmm3,1 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 YMMWORD[64+r11*1+r10]{k1},ymm3 + vmovdqu8 zmm3{k1}{z},zmm3 + vpshufb zmm17,zmm0,zmm29 + vpshufb ymm19,ymm3,ymm29 + vextracti32x4 xmm7,zmm19,1 + sub r13,16 * (6 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_296 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm19,ymm1,0x01 + vpclmulqdq ymm5,ymm19,ymm1,0x10 + vpclmulqdq ymm0,ymm19,ymm1,0x11 + vpclmulqdq ymm3,ymm19,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_296 +$L$_small_initial_partial_block_296: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm19,xmm1,0x01 + vpclmulqdq xmm5,xmm19,xmm1,0x10 + vpclmulqdq xmm0,xmm19,xmm1,0x11 + vpclmulqdq xmm3,xmm19,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_296: + + or r13,r13 + je NEAR $L$_after_reduction_296 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_296: + jmp NEAR $L$_last_blocks_done_284 +$L$_last_num_blocks_is_7_284: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,64 + kmovq k1,[rax*8+r10] + cmp r15d,249 + jae NEAR $L$_16_blocks_overflow_297 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + jmp NEAR $L$_16_blocks_ok_297 + +$L$_16_blocks_overflow_297: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + 
vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 +$L$_16_blocks_ok_297: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm3,2 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19{k1}{z},[64+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti32x4 xmm11,zmm3,2 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10]{k1},zmm3 + vmovdqu8 zmm3{k1}{z},zmm3 + vpshufb zmm17,zmm0,zmm29 + vpshufb zmm19,zmm3,zmm29 + vextracti32x4 xmm7,zmm19,2 + sub r13,16 * (7 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_298 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm19,zmm1,0x01 + vpclmulqdq zmm5,zmm19,zmm1,0x10 + vpclmulqdq zmm0,zmm19,zmm1,0x11 + vpclmulqdq zmm3,zmm19,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq 
zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_298 +$L$_small_initial_partial_block_298: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm19,ymm1,0x01 + vpclmulqdq ymm5,ymm19,ymm1,0x10 + vpclmulqdq ymm0,ymm19,ymm1,0x11 + vpclmulqdq ymm3,ymm19,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_298: + + or r13,r13 + je NEAR $L$_after_reduction_298 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_298: + jmp NEAR $L$_last_blocks_done_284 +$L$_last_num_blocks_is_8_284: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,64 + kmovq k1,[rax*8+r10] + cmp r15d,248 + jae NEAR $L$_16_blocks_overflow_299 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + jmp NEAR $L$_16_blocks_ok_299 + +$L$_16_blocks_overflow_299: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 +$L$_16_blocks_ok_299: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm3,3 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq 
zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19{k1}{z},[64+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti32x4 xmm11,zmm3,3 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10]{k1},zmm3 + vmovdqu8 zmm3{k1}{z},zmm3 + vpshufb zmm17,zmm0,zmm29 + vpshufb zmm19,zmm3,zmm29 + vextracti32x4 xmm7,zmm19,3 + sub r13,16 * (8 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_300 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[224+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + + vpxorq zmm30,zmm30,zmm26 + vpxorq zmm8,zmm8,zmm24 + vpxorq zmm22,zmm22,zmm25 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_300 +$L$_small_initial_partial_block_300: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm19,zmm1,0x01 + vpclmulqdq zmm5,zmm19,zmm1,0x10 + vpclmulqdq zmm0,zmm19,zmm1,0x11 + vpclmulqdq zmm3,zmm19,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + 
vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_300: + + or r13,r13 + je NEAR $L$_after_reduction_300 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_300: + jmp NEAR $L$_last_blocks_done_284 +$L$_last_num_blocks_is_9_284: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,128 + kmovq k1,[rax*8+r10] + cmp r15d,247 + jae NEAR $L$_16_blocks_overflow_301 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd xmm4,xmm3,xmm27 + jmp NEAR $L$_16_blocks_ok_301 + +$L$_16_blocks_overflow_301: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb xmm4,xmm4,xmm29 +$L$_16_blocks_ok_301: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm4,0 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 xmm20{k1}{z},[128+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + 
vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast xmm4,xmm4,xmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq xmm4,xmm4,xmm20 + vextracti32x4 xmm11,zmm4,0 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 XMMWORD[128+r11*1+r10]{k1},xmm4 + vmovdqu8 zmm4{k1}{z},zmm4 + vpshufb zmm17,zmm0,zmm29 + vpshufb zmm19,zmm3,zmm29 + vpshufb xmm20,xmm4,xmm29 + vextracti32x4 xmm7,zmm20,0 + sub r13,16 * (9 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_302 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[208+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm20,xmm1,0x01 + vpclmulqdq xmm5,xmm20,xmm1,0x10 + vpclmulqdq xmm0,xmm20,xmm1,0x11 + vpclmulqdq xmm3,xmm20,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_302 +$L$_small_initial_partial_block_302: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[224+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + + vpxorq zmm30,zmm30,zmm26 + vpxorq zmm8,zmm8,zmm24 + vpxorq zmm22,zmm22,zmm25 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + 
vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_302: + + or r13,r13 + je NEAR $L$_after_reduction_302 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_302: + jmp NEAR $L$_last_blocks_done_284 +$L$_last_num_blocks_is_10_284: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,128 + kmovq k1,[rax*8+r10] + cmp r15d,246 + jae NEAR $L$_16_blocks_overflow_303 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd ymm4,ymm3,ymm27 + jmp NEAR $L$_16_blocks_ok_303 + +$L$_16_blocks_overflow_303: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb ymm4,ymm4,ymm29 +$L$_16_blocks_ok_303: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm4,1 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 ymm20{k1}{z},[128+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc 
zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast ymm4,ymm4,ymm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq ymm4,ymm4,ymm20 + vextracti32x4 xmm11,zmm4,1 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 YMMWORD[128+r11*1+r10]{k1},ymm4 + vmovdqu8 zmm4{k1}{z},zmm4 + vpshufb zmm17,zmm0,zmm29 + vpshufb zmm19,zmm3,zmm29 + vpshufb ymm20,ymm4,ymm29 + vextracti32x4 xmm7,zmm20,1 + sub r13,16 * (10 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_304 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[192+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm20,ymm1,0x01 + vpclmulqdq ymm5,ymm20,ymm1,0x10 + vpclmulqdq ymm0,ymm20,ymm1,0x11 + vpclmulqdq ymm3,ymm20,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_304 +$L$_small_initial_partial_block_304: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[208+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm20,xmm1,0x01 + vpclmulqdq xmm5,xmm20,xmm1,0x10 + vpclmulqdq xmm0,xmm20,xmm1,0x11 + vpclmulqdq xmm3,xmm20,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq 
xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_304: + + or r13,r13 + je NEAR $L$_after_reduction_304 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_304: + jmp NEAR $L$_last_blocks_done_284 +$L$_last_num_blocks_is_11_284: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,128 + kmovq k1,[rax*8+r10] + cmp r15d,245 + jae NEAR $L$_16_blocks_overflow_305 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + jmp NEAR $L$_16_blocks_ok_305 + +$L$_16_blocks_overflow_305: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 +$L$_16_blocks_ok_305: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm4,2 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20{k1}{z},[128+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc 
zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vextracti32x4 xmm11,zmm4,2 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10]{k1},zmm4 + vmovdqu8 zmm4{k1}{z},zmm4 + vpshufb zmm17,zmm0,zmm29 + vpshufb zmm19,zmm3,zmm29 + vpshufb zmm20,zmm4,zmm29 + vextracti32x4 xmm7,zmm20,2 + sub r13,16 * (11 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_306 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[176+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm20,zmm1,0x01 + vpclmulqdq zmm5,zmm20,zmm1,0x10 + vpclmulqdq zmm0,zmm20,zmm1,0x11 + vpclmulqdq zmm3,zmm20,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_306 +$L$_small_initial_partial_block_306: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[192+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm20,ymm1,0x01 + vpclmulqdq ymm5,ymm20,ymm1,0x10 + vpclmulqdq ymm0,ymm20,ymm1,0x11 + vpclmulqdq ymm3,ymm20,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + 
vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_306: + + or r13,r13 + je NEAR $L$_after_reduction_306 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_306: + jmp NEAR $L$_last_blocks_done_284 +$L$_last_num_blocks_is_12_284: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,128 + kmovq k1,[rax*8+r10] + cmp r15d,244 + jae NEAR $L$_16_blocks_overflow_307 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + jmp NEAR $L$_16_blocks_ok_307 + +$L$_16_blocks_overflow_307: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 +$L$_16_blocks_ok_307: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm4,3 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20{k1}{z},[128+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + 
vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vextracti32x4 xmm11,zmm4,3 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10]{k1},zmm4 + vmovdqu8 zmm4{k1}{z},zmm4 + vpshufb zmm17,zmm0,zmm29 + vpshufb zmm19,zmm3,zmm29 + vpshufb zmm20,zmm4,zmm29 + vextracti32x4 xmm7,zmm20,3 + sub r13,16 * (12 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_308 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[160+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[224+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + + vpxorq zmm30,zmm30,zmm26 + vpxorq zmm8,zmm8,zmm24 + vpxorq zmm22,zmm22,zmm25 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_308 +$L$_small_initial_partial_block_308: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[176+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm20,zmm1,0x01 + vpclmulqdq zmm5,zmm20,zmm1,0x10 + vpclmulqdq zmm0,zmm20,zmm1,0x11 + vpclmulqdq zmm3,zmm20,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq 
xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_308: + + or r13,r13 + je NEAR $L$_after_reduction_308 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_308: + jmp NEAR $L$_last_blocks_done_284 +$L$_last_num_blocks_is_13_284: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,192 + kmovq k1,[rax*8+r10] + cmp r15d,243 + jae NEAR $L$_16_blocks_overflow_309 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + vpaddd xmm5,xmm4,xmm27 + jmp NEAR $L$_16_blocks_ok_309 + +$L$_16_blocks_overflow_309: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpaddd zmm5,zmm4,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb xmm5,xmm5,xmm29 +$L$_16_blocks_ok_309: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm5,0 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vpxorq xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20,ZMMWORD[128+r11*1+r9] + vmovdqu8 xmm21{k1}{z},[192+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc xmm5,xmm5,xmm30 + vbroadcastf64x2 
zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vaesenclast xmm5,xmm5,xmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vpxorq xmm5,xmm5,xmm21 + vextracti32x4 xmm11,zmm5,0 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm4 + vmovdqu8 XMMWORD[192+r11*1+r10]{k1},xmm5 + vmovdqu8 zmm5{k1}{z},zmm5 + vpshufb zmm17,zmm0,zmm29 + vpshufb zmm19,zmm3,zmm29 + vpshufb zmm20,zmm4,zmm29 + vpshufb xmm21,xmm5,xmm29 + vextracti32x4 xmm7,zmm21,0 + sub r13,16 * (13 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_310 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[144+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[208+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm21,xmm1,0x01 + vpclmulqdq xmm5,xmm21,xmm1,0x10 + vpclmulqdq xmm0,xmm21,xmm1,0x11 + vpclmulqdq xmm3,xmm21,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_310 +$L$_small_initial_partial_block_310: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[160+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[224+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + + vpxorq zmm30,zmm30,zmm26 + vpxorq 
zmm8,zmm8,zmm24 + vpxorq zmm22,zmm22,zmm25 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_310: + + or r13,r13 + je NEAR $L$_after_reduction_310 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_310: + jmp NEAR $L$_last_blocks_done_284 +$L$_last_num_blocks_is_14_284: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,192 + kmovq k1,[rax*8+r10] + cmp r15d,242 + jae NEAR $L$_16_blocks_overflow_311 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + vpaddd ymm5,ymm4,ymm27 + jmp NEAR $L$_16_blocks_ok_311 + +$L$_16_blocks_overflow_311: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpaddd zmm5,zmm4,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb ymm5,ymm5,ymm29 +$L$_16_blocks_ok_311: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm5,1 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vpxorq ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20,ZMMWORD[128+r11*1+r9] + vmovdqu8 ymm21{k1}{z},[192+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq 
zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vaesenclast ymm5,ymm5,ymm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vpxorq ymm5,ymm5,ymm21 + vextracti32x4 xmm11,zmm5,1 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm4 + vmovdqu8 YMMWORD[192+r11*1+r10]{k1},ymm5 + vmovdqu8 zmm5{k1}{z},zmm5 + vpshufb zmm17,zmm0,zmm29 + vpshufb zmm19,zmm3,zmm29 + vpshufb zmm20,zmm4,zmm29 + vpshufb ymm21,ymm5,ymm29 + vextracti32x4 xmm7,zmm21,1 + sub r13,16 * (14 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_312 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[128+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[192+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm21,ymm1,0x01 + vpclmulqdq ymm5,ymm21,ymm1,0x10 + vpclmulqdq ymm0,ymm21,ymm1,0x11 + vpclmulqdq ymm3,ymm21,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_312 +$L$_small_initial_partial_block_312: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[144+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + 
vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[208+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm21,xmm1,0x01 + vpclmulqdq xmm5,xmm21,xmm1,0x10 + vpclmulqdq xmm0,xmm21,xmm1,0x11 + vpclmulqdq xmm3,xmm21,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_312: + + or r13,r13 + je NEAR $L$_after_reduction_312 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_312: + jmp NEAR $L$_last_blocks_done_284 +$L$_last_num_blocks_is_15_284: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,192 + kmovq k1,[rax*8+r10] + cmp r15d,241 + jae NEAR $L$_16_blocks_overflow_313 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + vpaddd zmm5,zmm4,zmm27 + jmp NEAR $L$_16_blocks_ok_313 + +$L$_16_blocks_overflow_313: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpaddd zmm5,zmm4,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb zmm5,zmm5,zmm29 +$L$_16_blocks_ok_313: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm5,2 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq 
zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20,ZMMWORD[128+r11*1+r9] + vmovdqu8 zmm21{k1}{z},[192+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vaesenclast zmm5,zmm5,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vpxorq zmm5,zmm5,zmm21 + vextracti32x4 xmm11,zmm5,2 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm4 + vmovdqu8 ZMMWORD[192+r11*1+r10]{k1},zmm5 + vmovdqu8 zmm5{k1}{z},zmm5 + vpshufb zmm17,zmm0,zmm29 + vpshufb zmm19,zmm3,zmm29 + vpshufb zmm20,zmm4,zmm29 + vpshufb zmm21,zmm5,zmm29 + vextracti32x4 xmm7,zmm21,2 + sub r13,16 * (15 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_314 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[112+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[176+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm21,zmm1,0x01 + vpclmulqdq zmm5,zmm21,zmm1,0x10 + vpclmulqdq zmm0,zmm21,zmm1,0x11 + vpclmulqdq zmm3,zmm21,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq 
zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_314 +$L$_small_initial_partial_block_314: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[128+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[192+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm21,ymm1,0x01 + vpclmulqdq ymm5,ymm21,ymm1,0x10 + vpclmulqdq ymm0,ymm21,ymm1,0x11 + vpclmulqdq ymm3,ymm21,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_314: + + or r13,r13 + je NEAR $L$_after_reduction_314 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_314: + jmp NEAR $L$_last_blocks_done_284 +$L$_last_num_blocks_is_16_284: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,192 + kmovq k1,[rax*8+r10] + cmp r15d,240 + jae NEAR $L$_16_blocks_overflow_315 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + vpaddd zmm5,zmm4,zmm27 + jmp NEAR $L$_16_blocks_ok_315 + +$L$_16_blocks_overflow_315: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpaddd zmm5,zmm4,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb zmm5,zmm5,zmm29 +$L$_16_blocks_ok_315: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm5,3 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq 
zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20,ZMMWORD[128+r11*1+r9] + vmovdqu8 zmm21{k1}{z},[192+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vaesenclast zmm5,zmm5,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vpxorq zmm5,zmm5,zmm21 + vextracti32x4 xmm11,zmm5,3 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm4 + vmovdqu8 ZMMWORD[192+r11*1+r10]{k1},zmm5 + vmovdqu8 zmm5{k1}{z},zmm5 + vpshufb zmm17,zmm0,zmm29 + vpshufb zmm19,zmm3,zmm29 + vpshufb zmm20,zmm4,zmm29 + vpshufb zmm21,zmm5,zmm29 + vextracti32x4 xmm7,zmm21,3 + sub r13,16 * (16 - 1) +$L$_small_initial_partial_block_316: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[112+rdx] + vpclmulqdq 
zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[176+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm21,zmm1,0x01 + vpclmulqdq zmm5,zmm21,zmm1,0x10 + vpclmulqdq zmm0,zmm21,zmm1,0x11 + vpclmulqdq zmm3,zmm21,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_316: + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_316: + jmp NEAR $L$_last_blocks_done_284 +$L$_last_num_blocks_is_0_284: + vmovdqa64 zmm13,ZMMWORD[768+rsp] + vpxorq zmm13,zmm13,zmm14 + vmovdqu64 zmm12,ZMMWORD[rbx*1+rsp] + vpclmulqdq zmm0,zmm13,zmm12,0x11 + vpclmulqdq zmm3,zmm13,zmm12,0x00 + vpclmulqdq zmm4,zmm13,zmm12,0x01 + vpclmulqdq zmm5,zmm13,zmm12,0x10 + vmovdqa64 zmm13,ZMMWORD[832+rsp] + vmovdqu64 zmm12,ZMMWORD[64+rbx*1+rsp] + vpclmulqdq zmm6,zmm13,zmm12,0x11 + vpclmulqdq zmm7,zmm13,zmm12,0x00 + vpclmulqdq zmm10,zmm13,zmm12,0x01 + vpclmulqdq zmm11,zmm13,zmm12,0x10 + vpxorq zmm26,zmm4,zmm10 + vpxorq zmm24,zmm0,zmm6 + vpxorq zmm25,zmm3,zmm7 + vpternlogq zmm26,zmm5,zmm11,0x96 + vmovdqa64 zmm13,ZMMWORD[896+rsp] + vmovdqu64 zmm12,ZMMWORD[128+rbx*1+rsp] + vpclmulqdq zmm0,zmm13,zmm12,0x11 + vpclmulqdq zmm3,zmm13,zmm12,0x00 + vpclmulqdq zmm4,zmm13,zmm12,0x01 + vpclmulqdq zmm5,zmm13,zmm12,0x10 + vmovdqa64 zmm13,ZMMWORD[960+rsp] + vmovdqu64 zmm12,ZMMWORD[192+rbx*1+rsp] + vpclmulqdq zmm6,zmm13,zmm12,0x11 + vpclmulqdq zmm7,zmm13,zmm12,0x00 + vpclmulqdq zmm10,zmm13,zmm12,0x01 + vpclmulqdq zmm11,zmm13,zmm12,0x10 + + vpternlogq zmm26,zmm4,zmm10,0x96 + vpternlogq zmm24,zmm0,zmm6,0x96 + vpternlogq zmm25,zmm3,zmm7,0x96 + vpternlogq zmm26,zmm5,zmm11,0x96 + + vpsrldq zmm0,zmm26,8 + vpslldq zmm3,zmm26,8 + vpxorq zmm24,zmm24,zmm0 + vpxorq zmm25,zmm25,zmm3 + vextracti64x4 ymm0,zmm24,1 + vpxorq ymm24,ymm24,ymm0 + vextracti32x4 xmm0,ymm24,1 + vpxorq xmm24,xmm24,xmm0 + vextracti64x4 ymm3,zmm25,1 + vpxorq ymm25,ymm25,ymm3 + vextracti32x4 xmm3,ymm25,1 + vpxorq xmm25,xmm25,xmm3 + vmovdqa64 xmm4,XMMWORD[POLY2] + + + vpclmulqdq xmm0,xmm4,xmm25,0x01 + vpslldq xmm0,xmm0,8 + vpxorq xmm0,xmm25,xmm0 + + + vpclmulqdq xmm3,xmm4,xmm0,0x00 + vpsrldq xmm3,xmm3,4 + vpclmulqdq xmm14,xmm4,xmm0,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm3,xmm24,0x96 + +$L$_last_blocks_done_284: + vpshufb xmm2,xmm2,xmm29 + jmp NEAR $L$_ghash_done_172 + 
+$L$_message_below_equal_16_blocks_172: + + + mov r12d,r13d + add r12d,15 + shr r12d,4 + cmp r12,8 + je NEAR $L$_small_initial_num_blocks_is_8_317 + jl NEAR $L$_small_initial_num_blocks_is_7_1_317 + + + cmp r12,12 + je NEAR $L$_small_initial_num_blocks_is_12_317 + jl NEAR $L$_small_initial_num_blocks_is_11_9_317 + + + cmp r12,16 + je NEAR $L$_small_initial_num_blocks_is_16_317 + cmp r12,15 + je NEAR $L$_small_initial_num_blocks_is_15_317 + cmp r12,14 + je NEAR $L$_small_initial_num_blocks_is_14_317 + jmp NEAR $L$_small_initial_num_blocks_is_13_317 + +$L$_small_initial_num_blocks_is_11_9_317: + + cmp r12,11 + je NEAR $L$_small_initial_num_blocks_is_11_317 + cmp r12,10 + je NEAR $L$_small_initial_num_blocks_is_10_317 + jmp NEAR $L$_small_initial_num_blocks_is_9_317 + +$L$_small_initial_num_blocks_is_7_1_317: + cmp r12,4 + je NEAR $L$_small_initial_num_blocks_is_4_317 + jl NEAR $L$_small_initial_num_blocks_is_3_1_317 + + cmp r12,7 + je NEAR $L$_small_initial_num_blocks_is_7_317 + cmp r12,6 + je NEAR $L$_small_initial_num_blocks_is_6_317 + jmp NEAR $L$_small_initial_num_blocks_is_5_317 + +$L$_small_initial_num_blocks_is_3_1_317: + + cmp r12,3 + je NEAR $L$_small_initial_num_blocks_is_3_317 + cmp r12,2 + je NEAR $L$_small_initial_num_blocks_is_2_317 + + + + + +$L$_small_initial_num_blocks_is_1_317: + vmovdqa64 xmm29,XMMWORD[SHUF_MASK] + vpaddd xmm0,xmm2,XMMWORD[ONE] + lea r10,[byte64_len_to_mask_table] + mov r15,r13 + kmovq k1,[r15*8+r10] + vextracti32x4 xmm2,zmm0,0 + vpshufb xmm0,xmm0,xmm29 + vmovdqu8 xmm6{k1}{z},[r11*1+r9] + vbroadcastf64x2 zmm15,ZMMWORD[rcx] + vpxorq xmm0,xmm0,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[16+rcx] + vaesenc xmm0,xmm0,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[32+rcx] + vaesenc xmm0,xmm0,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[48+rcx] + vaesenc xmm0,xmm0,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[64+rcx] + vaesenc xmm0,xmm0,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[80+rcx] + vaesenc xmm0,xmm0,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[96+rcx] + vaesenc xmm0,xmm0,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[112+rcx] + vaesenc xmm0,xmm0,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[128+rcx] + vaesenc xmm0,xmm0,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[144+rcx] + vaesenc xmm0,xmm0,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[160+rcx] + vaesenc xmm0,xmm0,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[176+rcx] + vaesenc xmm0,xmm0,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[192+rcx] + vaesenclast xmm0,xmm0,xmm15 + vpxorq xmm0,xmm0,xmm6 + vextracti32x4 xmm12,zmm0,0 + mov r10,QWORD[120+rbp] + vmovdqu8 XMMWORD[r11*1+r10]{k1},xmm0 + vmovdqu8 zmm0{k1}{z},zmm0 + vpshufb xmm6,xmm0,xmm29 + vextracti32x4 xmm13,zmm6,0 + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_318 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 xmm20,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm6,xmm20,0x01 + vpclmulqdq xmm5,xmm6,xmm20,0x10 + vpclmulqdq xmm0,xmm6,xmm20,0x11 + vpclmulqdq xmm3,xmm6,xmm20,0x00 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR 
$L$_small_initial_compute_done_318 +$L$_small_initial_partial_block_318: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm12 + + + + + + + + + + + + vpxorq xmm14,xmm14,xmm13 + + jmp NEAR $L$_after_reduction_318 +$L$_small_initial_compute_done_318: +$L$_after_reduction_318: + jmp NEAR $L$_small_initial_blocks_encrypted_317 +$L$_small_initial_num_blocks_is_2_317: + vmovdqa64 ymm29,YMMWORD[SHUF_MASK] + vshufi64x2 ymm0,ymm2,ymm2,0 + vpaddd ymm0,ymm0,YMMWORD[ddq_add_1234] + lea r10,[byte64_len_to_mask_table] + mov r15,r13 + kmovq k1,[r15*8+r10] + vextracti32x4 xmm2,zmm0,1 + vpshufb ymm0,ymm0,ymm29 + vmovdqu8 ymm6{k1}{z},[r11*1+r9] + vbroadcastf64x2 zmm15,ZMMWORD[rcx] + vpxorq ymm0,ymm0,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[16+rcx] + vaesenc ymm0,ymm0,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[32+rcx] + vaesenc ymm0,ymm0,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[48+rcx] + vaesenc ymm0,ymm0,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[64+rcx] + vaesenc ymm0,ymm0,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[80+rcx] + vaesenc ymm0,ymm0,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[96+rcx] + vaesenc ymm0,ymm0,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[112+rcx] + vaesenc ymm0,ymm0,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[128+rcx] + vaesenc ymm0,ymm0,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[144+rcx] + vaesenc ymm0,ymm0,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[160+rcx] + vaesenc ymm0,ymm0,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[176+rcx] + vaesenc ymm0,ymm0,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[192+rcx] + vaesenclast ymm0,ymm0,ymm15 + vpxorq ymm0,ymm0,ymm6 + vextracti32x4 xmm12,zmm0,1 + mov r10,QWORD[120+rbp] + vmovdqu8 YMMWORD[r11*1+r10]{k1},ymm0 + vmovdqu8 zmm0{k1}{z},zmm0 + vpshufb ymm6,ymm0,ymm29 + vextracti32x4 xmm13,zmm6,1 + sub r13,16 * (2 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_319 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 ymm20,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm6,ymm20,0x01 + vpclmulqdq ymm5,ymm6,ymm20,0x10 + vpclmulqdq ymm0,ymm6,ymm20,0x11 + vpclmulqdq ymm3,ymm6,ymm20,0x00 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_319 +$L$_small_initial_partial_block_319: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm12 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 xmm20,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm6,xmm20,0x01 + vpclmulqdq xmm5,xmm6,xmm20,0x10 + vpclmulqdq xmm0,xmm6,xmm20,0x11 + vpclmulqdq xmm3,xmm6,xmm20,0x00 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + 
vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_319: + + or r13,r13 + je NEAR $L$_after_reduction_319 + vpxorq xmm14,xmm14,xmm13 +$L$_after_reduction_319: + jmp NEAR $L$_small_initial_blocks_encrypted_317 +$L$_small_initial_num_blocks_is_3_317: + vmovdqa64 zmm29,ZMMWORD[SHUF_MASK] + vshufi64x2 zmm2,zmm2,zmm2,0 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + lea r10,[byte64_len_to_mask_table] + mov r15,r13 + kmovq k1,[r15*8+r10] + vextracti32x4 xmm2,zmm0,2 + vpshufb zmm0,zmm0,zmm29 + vmovdqu8 zmm6{k1}{z},[r11*1+r9] + vbroadcastf64x2 zmm15,ZMMWORD[rcx] + vpxorq zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[16+rcx] + vaesenc zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[32+rcx] + vaesenc zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[48+rcx] + vaesenc zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[64+rcx] + vaesenc zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[80+rcx] + vaesenc zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[96+rcx] + vaesenc zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[112+rcx] + vaesenc zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[128+rcx] + vaesenc zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[192+rcx] + vaesenclast zmm0,zmm0,zmm15 + vpxorq zmm0,zmm0,zmm6 + vextracti32x4 xmm12,zmm0,2 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10]{k1},zmm0 + vmovdqu8 zmm0{k1}{z},zmm0 + vpshufb zmm6,zmm0,zmm29 + vextracti32x4 xmm13,zmm6,2 + sub r13,16 * (3 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_320 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 ymm20,YMMWORD[304+rdx] + vinserti64x2 zmm20,zmm20,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm6,zmm20,0x01 + vpclmulqdq zmm5,zmm6,zmm20,0x10 + vpclmulqdq zmm0,zmm6,zmm20,0x11 + vpclmulqdq zmm3,zmm6,zmm20,0x00 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_320 +$L$_small_initial_partial_block_320: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm12 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 ymm20,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm6,ymm20,0x01 + vpclmulqdq ymm5,ymm6,ymm20,0x10 + vpclmulqdq ymm0,ymm6,ymm20,0x11 + vpclmulqdq ymm3,ymm6,ymm20,0x00 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq 
xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_320: + + or r13,r13 + je NEAR $L$_after_reduction_320 + vpxorq xmm14,xmm14,xmm13 +$L$_after_reduction_320: + jmp NEAR $L$_small_initial_blocks_encrypted_317 +$L$_small_initial_num_blocks_is_4_317: + vmovdqa64 zmm29,ZMMWORD[SHUF_MASK] + vshufi64x2 zmm2,zmm2,zmm2,0 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + lea r10,[byte64_len_to_mask_table] + mov r15,r13 + kmovq k1,[r15*8+r10] + vextracti32x4 xmm2,zmm0,3 + vpshufb zmm0,zmm0,zmm29 + vmovdqu8 zmm6{k1}{z},[r11*1+r9] + vbroadcastf64x2 zmm15,ZMMWORD[rcx] + vpxorq zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[16+rcx] + vaesenc zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[32+rcx] + vaesenc zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[48+rcx] + vaesenc zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[64+rcx] + vaesenc zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[80+rcx] + vaesenc zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[96+rcx] + vaesenc zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[112+rcx] + vaesenc zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[128+rcx] + vaesenc zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[192+rcx] + vaesenclast zmm0,zmm0,zmm15 + vpxorq zmm0,zmm0,zmm6 + vextracti32x4 xmm12,zmm0,3 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10]{k1},zmm0 + vmovdqu8 zmm0{k1}{z},zmm0 + vpshufb zmm6,zmm0,zmm29 + vextracti32x4 xmm13,zmm6,3 + sub r13,16 * (4 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_321 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[288+rdx] + vpclmulqdq zmm15,zmm6,zmm20,0x11 + vpclmulqdq zmm16,zmm6,zmm20,0x00 + vpclmulqdq zmm17,zmm6,zmm20,0x01 + vpclmulqdq zmm19,zmm6,zmm20,0x10 + + vpxorq zmm17,zmm17,zmm19 + vpsrldq zmm4,zmm17,8 + vpslldq zmm5,zmm17,8 + vpxorq zmm0,zmm15,zmm4 + vpxorq zmm3,zmm16,zmm5 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_321 +$L$_small_initial_partial_block_321: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm12 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 ymm20,YMMWORD[304+rdx] + vinserti64x2 zmm20,zmm20,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm6,zmm20,0x01 + vpclmulqdq zmm5,zmm6,zmm20,0x10 + vpclmulqdq zmm0,zmm6,zmm20,0x11 + vpclmulqdq zmm3,zmm6,zmm20,0x00 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + 
+$L$_small_initial_compute_done_321: + + or r13,r13 + je NEAR $L$_after_reduction_321 + vpxorq xmm14,xmm14,xmm13 +$L$_after_reduction_321: + jmp NEAR $L$_small_initial_blocks_encrypted_317 +$L$_small_initial_num_blocks_is_5_317: + vmovdqa64 zmm29,ZMMWORD[SHUF_MASK] + vshufi64x2 zmm2,zmm2,zmm2,0 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpaddd zmm3,zmm2,ZMMWORD[ddq_add_5678] + lea r10,[byte64_len_to_mask_table] + mov r15,r13 + sub r15,64 + kmovq k1,[r15*8+r10] + vextracti32x4 xmm2,zmm3,0 + vpshufb zmm0,zmm0,zmm29 + vpshufb xmm3,xmm3,xmm29 + vmovdqu8 zmm6,ZMMWORD[r11*1+r9] + vmovdqu8 xmm7{k1}{z},[64+r11*1+r9] + vbroadcastf64x2 zmm15,ZMMWORD[rcx] + vpxorq zmm0,zmm0,zmm15 + vpxorq xmm3,xmm3,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[16+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc xmm3,xmm3,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[32+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc xmm3,xmm3,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[48+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc xmm3,xmm3,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[64+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc xmm3,xmm3,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[80+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc xmm3,xmm3,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[96+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc xmm3,xmm3,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[112+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc xmm3,xmm3,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[128+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc xmm3,xmm3,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc xmm3,xmm3,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc xmm3,xmm3,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc xmm3,xmm3,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[192+rcx] + vaesenclast zmm0,zmm0,zmm15 + vaesenclast xmm3,xmm3,xmm15 + vpxorq zmm0,zmm0,zmm6 + vpxorq xmm3,xmm3,xmm7 + vextracti32x4 xmm12,zmm3,0 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 XMMWORD[64+r11*1+r10]{k1},xmm3 + vmovdqu8 zmm3{k1}{z},zmm3 + vpshufb zmm6,zmm0,zmm29 + vpshufb xmm7,xmm3,xmm29 + vextracti32x4 xmm13,zmm7,0 + sub r13,16 * (5 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_322 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[272+rdx] + vpclmulqdq zmm15,zmm6,zmm20,0x11 + vpclmulqdq zmm16,zmm6,zmm20,0x00 + vpclmulqdq zmm17,zmm6,zmm20,0x01 + vpclmulqdq zmm19,zmm6,zmm20,0x10 + vmovdqu64 xmm20,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm7,xmm20,0x01 + vpclmulqdq xmm5,xmm7,xmm20,0x10 + vpclmulqdq xmm0,xmm7,xmm20,0x11 + vpclmulqdq xmm3,xmm7,xmm20,0x00 + + vpxorq zmm4,zmm4,zmm17 + vpxorq zmm5,zmm5,zmm19 + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm16 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_322 +$L$_small_initial_partial_block_322: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm12 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[288+rdx] + vpclmulqdq 
zmm15,zmm6,zmm20,0x11 + vpclmulqdq zmm16,zmm6,zmm20,0x00 + vpclmulqdq zmm17,zmm6,zmm20,0x01 + vpclmulqdq zmm19,zmm6,zmm20,0x10 + + vpxorq zmm17,zmm17,zmm19 + vpsrldq zmm4,zmm17,8 + vpslldq zmm5,zmm17,8 + vpxorq zmm0,zmm15,zmm4 + vpxorq zmm3,zmm16,zmm5 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_322: + + or r13,r13 + je NEAR $L$_after_reduction_322 + vpxorq xmm14,xmm14,xmm13 +$L$_after_reduction_322: + jmp NEAR $L$_small_initial_blocks_encrypted_317 +$L$_small_initial_num_blocks_is_6_317: + vmovdqa64 zmm29,ZMMWORD[SHUF_MASK] + vshufi64x2 zmm2,zmm2,zmm2,0 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpaddd zmm3,zmm2,ZMMWORD[ddq_add_5678] + lea r10,[byte64_len_to_mask_table] + mov r15,r13 + sub r15,64 + kmovq k1,[r15*8+r10] + vextracti32x4 xmm2,zmm3,1 + vpshufb zmm0,zmm0,zmm29 + vpshufb ymm3,ymm3,ymm29 + vmovdqu8 zmm6,ZMMWORD[r11*1+r9] + vmovdqu8 ymm7{k1}{z},[64+r11*1+r9] + vbroadcastf64x2 zmm15,ZMMWORD[rcx] + vpxorq zmm0,zmm0,zmm15 + vpxorq ymm3,ymm3,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[16+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc ymm3,ymm3,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[32+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc ymm3,ymm3,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[48+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc ymm3,ymm3,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[64+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc ymm3,ymm3,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[80+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc ymm3,ymm3,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[96+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc ymm3,ymm3,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[112+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc ymm3,ymm3,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[128+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc ymm3,ymm3,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc ymm3,ymm3,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc ymm3,ymm3,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc ymm3,ymm3,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[192+rcx] + vaesenclast zmm0,zmm0,zmm15 + vaesenclast ymm3,ymm3,ymm15 + vpxorq zmm0,zmm0,zmm6 + vpxorq ymm3,ymm3,ymm7 + vextracti32x4 xmm12,zmm3,1 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 YMMWORD[64+r11*1+r10]{k1},ymm3 + vmovdqu8 zmm3{k1}{z},zmm3 + vpshufb zmm6,zmm0,zmm29 + vpshufb ymm7,ymm3,ymm29 + vextracti32x4 xmm13,zmm7,1 + sub r13,16 * (6 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_323 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[256+rdx] + vpclmulqdq zmm15,zmm6,zmm20,0x11 + vpclmulqdq zmm16,zmm6,zmm20,0x00 + vpclmulqdq zmm17,zmm6,zmm20,0x01 + vpclmulqdq zmm19,zmm6,zmm20,0x10 + vmovdqu64 ymm20,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm7,ymm20,0x01 + vpclmulqdq ymm5,ymm7,ymm20,0x10 + vpclmulqdq ymm0,ymm7,ymm20,0x11 + vpclmulqdq ymm3,ymm7,ymm20,0x00 + + vpxorq zmm4,zmm4,zmm17 + vpxorq zmm5,zmm5,zmm19 + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm16 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq 
zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_323 +$L$_small_initial_partial_block_323: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm12 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[272+rdx] + vpclmulqdq zmm15,zmm6,zmm20,0x11 + vpclmulqdq zmm16,zmm6,zmm20,0x00 + vpclmulqdq zmm17,zmm6,zmm20,0x01 + vpclmulqdq zmm19,zmm6,zmm20,0x10 + vmovdqu64 xmm20,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm7,xmm20,0x01 + vpclmulqdq xmm5,xmm7,xmm20,0x10 + vpclmulqdq xmm0,xmm7,xmm20,0x11 + vpclmulqdq xmm3,xmm7,xmm20,0x00 + + vpxorq zmm4,zmm4,zmm17 + vpxorq zmm5,zmm5,zmm19 + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm16 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_323: + + or r13,r13 + je NEAR $L$_after_reduction_323 + vpxorq xmm14,xmm14,xmm13 +$L$_after_reduction_323: + jmp NEAR $L$_small_initial_blocks_encrypted_317 +$L$_small_initial_num_blocks_is_7_317: + vmovdqa64 zmm29,ZMMWORD[SHUF_MASK] + vshufi64x2 zmm2,zmm2,zmm2,0 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpaddd zmm3,zmm2,ZMMWORD[ddq_add_5678] + lea r10,[byte64_len_to_mask_table] + mov r15,r13 + sub r15,64 + kmovq k1,[r15*8+r10] + vextracti32x4 xmm2,zmm3,2 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vmovdqu8 zmm6,ZMMWORD[r11*1+r9] + vmovdqu8 zmm7{k1}{z},[64+r11*1+r9] + vbroadcastf64x2 zmm15,ZMMWORD[rcx] + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[16+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[32+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[48+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[64+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[80+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[96+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[112+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[128+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[192+rcx] + vaesenclast zmm0,zmm0,zmm15 + vaesenclast 
zmm3,zmm3,zmm15 + vpxorq zmm0,zmm0,zmm6 + vpxorq zmm3,zmm3,zmm7 + vextracti32x4 xmm12,zmm3,2 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10]{k1},zmm3 + vmovdqu8 zmm3{k1}{z},zmm3 + vpshufb zmm6,zmm0,zmm29 + vpshufb zmm7,zmm3,zmm29 + vextracti32x4 xmm13,zmm7,2 + sub r13,16 * (7 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_324 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[240+rdx] + vpclmulqdq zmm15,zmm6,zmm20,0x11 + vpclmulqdq zmm16,zmm6,zmm20,0x00 + vpclmulqdq zmm17,zmm6,zmm20,0x01 + vpclmulqdq zmm19,zmm6,zmm20,0x10 + vmovdqu64 ymm20,YMMWORD[304+rdx] + vinserti64x2 zmm20,zmm20,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm7,zmm20,0x01 + vpclmulqdq zmm5,zmm7,zmm20,0x10 + vpclmulqdq zmm0,zmm7,zmm20,0x11 + vpclmulqdq zmm3,zmm7,zmm20,0x00 + + vpxorq zmm4,zmm4,zmm17 + vpxorq zmm5,zmm5,zmm19 + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm16 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_324 +$L$_small_initial_partial_block_324: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm12 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[256+rdx] + vpclmulqdq zmm15,zmm6,zmm20,0x11 + vpclmulqdq zmm16,zmm6,zmm20,0x00 + vpclmulqdq zmm17,zmm6,zmm20,0x01 + vpclmulqdq zmm19,zmm6,zmm20,0x10 + vmovdqu64 ymm20,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm7,ymm20,0x01 + vpclmulqdq ymm5,ymm7,ymm20,0x10 + vpclmulqdq ymm0,ymm7,ymm20,0x11 + vpclmulqdq ymm3,ymm7,ymm20,0x00 + + vpxorq zmm4,zmm4,zmm17 + vpxorq zmm5,zmm5,zmm19 + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm16 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_324: + + or r13,r13 + je NEAR $L$_after_reduction_324 + vpxorq xmm14,xmm14,xmm13 +$L$_after_reduction_324: + jmp NEAR $L$_small_initial_blocks_encrypted_317 +$L$_small_initial_num_blocks_is_8_317: + vmovdqa64 zmm29,ZMMWORD[SHUF_MASK] + vshufi64x2 zmm2,zmm2,zmm2,0 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpaddd zmm3,zmm2,ZMMWORD[ddq_add_5678] + lea r10,[byte64_len_to_mask_table] + mov r15,r13 + sub r15,64 + kmovq k1,[r15*8+r10] + vextracti32x4 xmm2,zmm3,3 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vmovdqu8 zmm6,ZMMWORD[r11*1+r9] + vmovdqu8 zmm7{k1}{z},[64+r11*1+r9] + vbroadcastf64x2 zmm15,ZMMWORD[rcx] + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[16+rcx] + vaesenc 
zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[32+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[48+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[64+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[80+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[96+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[112+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[128+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[192+rcx] + vaesenclast zmm0,zmm0,zmm15 + vaesenclast zmm3,zmm3,zmm15 + vpxorq zmm0,zmm0,zmm6 + vpxorq zmm3,zmm3,zmm7 + vextracti32x4 xmm12,zmm3,3 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10]{k1},zmm3 + vmovdqu8 zmm3{k1}{z},zmm3 + vpshufb zmm6,zmm0,zmm29 + vpshufb zmm7,zmm3,zmm29 + vextracti32x4 xmm13,zmm7,3 + sub r13,16 * (8 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_325 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[224+rdx] + vpclmulqdq zmm0,zmm6,zmm20,0x11 + vpclmulqdq zmm3,zmm6,zmm20,0x00 + vpclmulqdq zmm4,zmm6,zmm20,0x01 + vpclmulqdq zmm5,zmm6,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[288+rdx] + vpclmulqdq zmm15,zmm7,zmm20,0x11 + vpclmulqdq zmm16,zmm7,zmm20,0x00 + vpclmulqdq zmm17,zmm7,zmm20,0x01 + vpclmulqdq zmm19,zmm7,zmm20,0x10 + vpxorq zmm15,zmm0,zmm15 + vpxorq zmm16,zmm3,zmm16 + vpxorq zmm17,zmm4,zmm17 + vpxorq zmm19,zmm5,zmm19 + + vpxorq zmm17,zmm17,zmm19 + vpsrldq zmm4,zmm17,8 + vpslldq zmm5,zmm17,8 + vpxorq zmm0,zmm15,zmm4 + vpxorq zmm3,zmm16,zmm5 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_325 +$L$_small_initial_partial_block_325: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm12 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[240+rdx] + vpclmulqdq zmm15,zmm6,zmm20,0x11 + vpclmulqdq zmm16,zmm6,zmm20,0x00 + vpclmulqdq zmm17,zmm6,zmm20,0x01 + vpclmulqdq zmm19,zmm6,zmm20,0x10 + vmovdqu64 ymm20,YMMWORD[304+rdx] + vinserti64x2 zmm20,zmm20,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm7,zmm20,0x01 + vpclmulqdq zmm5,zmm7,zmm20,0x10 + vpclmulqdq zmm0,zmm7,zmm20,0x11 + vpclmulqdq zmm3,zmm7,zmm20,0x00 + + vpxorq zmm4,zmm4,zmm17 + vpxorq zmm5,zmm5,zmm19 + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm16 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 
xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_325: + + or r13,r13 + je NEAR $L$_after_reduction_325 + vpxorq xmm14,xmm14,xmm13 +$L$_after_reduction_325: + jmp NEAR $L$_small_initial_blocks_encrypted_317 +$L$_small_initial_num_blocks_is_9_317: + vmovdqa64 zmm29,ZMMWORD[SHUF_MASK] + vshufi64x2 zmm2,zmm2,zmm2,0 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpaddd zmm3,zmm2,ZMMWORD[ddq_add_5678] + vpaddd zmm4,zmm0,ZMMWORD[ddq_add_8888] + lea r10,[byte64_len_to_mask_table] + mov r15,r13 + sub r15,128 + kmovq k1,[r15*8+r10] + vextracti32x4 xmm2,zmm4,0 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb xmm4,xmm4,xmm29 + vmovdqu8 zmm6,ZMMWORD[r11*1+r9] + vmovdqu8 zmm7,ZMMWORD[64+r11*1+r9] + vmovdqu8 xmm10{k1}{z},[128+r11*1+r9] + vbroadcastf64x2 zmm15,ZMMWORD[rcx] + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm15 + vpxorq xmm4,xmm4,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[16+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc xmm4,xmm4,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[32+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc xmm4,xmm4,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[48+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc xmm4,xmm4,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[64+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc xmm4,xmm4,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[80+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc xmm4,xmm4,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[96+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc xmm4,xmm4,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[112+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc xmm4,xmm4,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[128+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc xmm4,xmm4,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc xmm4,xmm4,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc xmm4,xmm4,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc xmm4,xmm4,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[192+rcx] + vaesenclast zmm0,zmm0,zmm15 + vaesenclast zmm3,zmm3,zmm15 + vaesenclast xmm4,xmm4,xmm15 + vpxorq zmm0,zmm0,zmm6 + vpxorq zmm3,zmm3,zmm7 + vpxorq xmm4,xmm4,xmm10 + vextracti32x4 xmm12,zmm4,0 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 XMMWORD[128+r11*1+r10]{k1},xmm4 + vmovdqu8 zmm4{k1}{z},zmm4 + vpshufb zmm6,zmm0,zmm29 + vpshufb zmm7,zmm3,zmm29 + vpshufb xmm10,xmm4,xmm29 + vextracti32x4 xmm13,zmm10,0 + sub r13,16 * (9 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_326 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[208+rdx] + vpclmulqdq zmm0,zmm6,zmm20,0x11 + vpclmulqdq zmm3,zmm6,zmm20,0x00 + vpclmulqdq zmm4,zmm6,zmm20,0x01 + vpclmulqdq zmm5,zmm6,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[272+rdx] + vpclmulqdq zmm15,zmm7,zmm20,0x11 + vpclmulqdq zmm16,zmm7,zmm20,0x00 + vpclmulqdq zmm17,zmm7,zmm20,0x01 + vpclmulqdq zmm19,zmm7,zmm20,0x10 + vpxorq zmm15,zmm0,zmm15 + vpxorq zmm16,zmm3,zmm16 + vpxorq zmm17,zmm4,zmm17 + vpxorq 
zmm19,zmm5,zmm19 + vmovdqu64 xmm20,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm10,xmm20,0x01 + vpclmulqdq xmm5,xmm10,xmm20,0x10 + vpclmulqdq xmm0,xmm10,xmm20,0x11 + vpclmulqdq xmm3,xmm10,xmm20,0x00 + + vpxorq zmm4,zmm4,zmm17 + vpxorq zmm5,zmm5,zmm19 + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm16 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_326 +$L$_small_initial_partial_block_326: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm12 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[224+rdx] + vpclmulqdq zmm0,zmm6,zmm20,0x11 + vpclmulqdq zmm3,zmm6,zmm20,0x00 + vpclmulqdq zmm4,zmm6,zmm20,0x01 + vpclmulqdq zmm5,zmm6,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[288+rdx] + vpclmulqdq zmm15,zmm7,zmm20,0x11 + vpclmulqdq zmm16,zmm7,zmm20,0x00 + vpclmulqdq zmm17,zmm7,zmm20,0x01 + vpclmulqdq zmm19,zmm7,zmm20,0x10 + vpxorq zmm15,zmm0,zmm15 + vpxorq zmm16,zmm3,zmm16 + vpxorq zmm17,zmm4,zmm17 + vpxorq zmm19,zmm5,zmm19 + + vpxorq zmm17,zmm17,zmm19 + vpsrldq zmm4,zmm17,8 + vpslldq zmm5,zmm17,8 + vpxorq zmm0,zmm15,zmm4 + vpxorq zmm3,zmm16,zmm5 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_326: + + or r13,r13 + je NEAR $L$_after_reduction_326 + vpxorq xmm14,xmm14,xmm13 +$L$_after_reduction_326: + jmp NEAR $L$_small_initial_blocks_encrypted_317 +$L$_small_initial_num_blocks_is_10_317: + vmovdqa64 zmm29,ZMMWORD[SHUF_MASK] + vshufi64x2 zmm2,zmm2,zmm2,0 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpaddd zmm3,zmm2,ZMMWORD[ddq_add_5678] + vpaddd zmm4,zmm0,ZMMWORD[ddq_add_8888] + lea r10,[byte64_len_to_mask_table] + mov r15,r13 + sub r15,128 + kmovq k1,[r15*8+r10] + vextracti32x4 xmm2,zmm4,1 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb ymm4,ymm4,ymm29 + vmovdqu8 zmm6,ZMMWORD[r11*1+r9] + vmovdqu8 zmm7,ZMMWORD[64+r11*1+r9] + vmovdqu8 ymm10{k1}{z},[128+r11*1+r9] + vbroadcastf64x2 zmm15,ZMMWORD[rcx] + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm15 + vpxorq ymm4,ymm4,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[16+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc ymm4,ymm4,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[32+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc ymm4,ymm4,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[48+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc ymm4,ymm4,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[64+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc ymm4,ymm4,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[80+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + 
vaesenc ymm4,ymm4,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[96+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc ymm4,ymm4,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[112+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc ymm4,ymm4,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[128+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc ymm4,ymm4,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc ymm4,ymm4,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc ymm4,ymm4,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc ymm4,ymm4,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[192+rcx] + vaesenclast zmm0,zmm0,zmm15 + vaesenclast zmm3,zmm3,zmm15 + vaesenclast ymm4,ymm4,ymm15 + vpxorq zmm0,zmm0,zmm6 + vpxorq zmm3,zmm3,zmm7 + vpxorq ymm4,ymm4,ymm10 + vextracti32x4 xmm12,zmm4,1 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 YMMWORD[128+r11*1+r10]{k1},ymm4 + vmovdqu8 zmm4{k1}{z},zmm4 + vpshufb zmm6,zmm0,zmm29 + vpshufb zmm7,zmm3,zmm29 + vpshufb ymm10,ymm4,ymm29 + vextracti32x4 xmm13,zmm10,1 + sub r13,16 * (10 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_327 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[192+rdx] + vpclmulqdq zmm0,zmm6,zmm20,0x11 + vpclmulqdq zmm3,zmm6,zmm20,0x00 + vpclmulqdq zmm4,zmm6,zmm20,0x01 + vpclmulqdq zmm5,zmm6,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[256+rdx] + vpclmulqdq zmm15,zmm7,zmm20,0x11 + vpclmulqdq zmm16,zmm7,zmm20,0x00 + vpclmulqdq zmm17,zmm7,zmm20,0x01 + vpclmulqdq zmm19,zmm7,zmm20,0x10 + vpxorq zmm15,zmm0,zmm15 + vpxorq zmm16,zmm3,zmm16 + vpxorq zmm17,zmm4,zmm17 + vpxorq zmm19,zmm5,zmm19 + vmovdqu64 ymm20,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm10,ymm20,0x01 + vpclmulqdq ymm5,ymm10,ymm20,0x10 + vpclmulqdq ymm0,ymm10,ymm20,0x11 + vpclmulqdq ymm3,ymm10,ymm20,0x00 + + vpxorq zmm4,zmm4,zmm17 + vpxorq zmm5,zmm5,zmm19 + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm16 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_327 +$L$_small_initial_partial_block_327: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm12 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[208+rdx] + vpclmulqdq zmm0,zmm6,zmm20,0x11 + vpclmulqdq zmm3,zmm6,zmm20,0x00 + vpclmulqdq zmm4,zmm6,zmm20,0x01 + vpclmulqdq zmm5,zmm6,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[272+rdx] + vpclmulqdq zmm15,zmm7,zmm20,0x11 + vpclmulqdq zmm16,zmm7,zmm20,0x00 + vpclmulqdq zmm17,zmm7,zmm20,0x01 + vpclmulqdq zmm19,zmm7,zmm20,0x10 + vpxorq zmm15,zmm0,zmm15 + vpxorq zmm16,zmm3,zmm16 + vpxorq zmm17,zmm4,zmm17 + vpxorq zmm19,zmm5,zmm19 + vmovdqu64 xmm20,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm10,xmm20,0x01 + vpclmulqdq xmm5,xmm10,xmm20,0x10 + vpclmulqdq xmm0,xmm10,xmm20,0x11 + vpclmulqdq xmm3,xmm10,xmm20,0x00 + + vpxorq 
zmm4,zmm4,zmm17 + vpxorq zmm5,zmm5,zmm19 + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm16 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_327: + + or r13,r13 + je NEAR $L$_after_reduction_327 + vpxorq xmm14,xmm14,xmm13 +$L$_after_reduction_327: + jmp NEAR $L$_small_initial_blocks_encrypted_317 +$L$_small_initial_num_blocks_is_11_317: + vmovdqa64 zmm29,ZMMWORD[SHUF_MASK] + vshufi64x2 zmm2,zmm2,zmm2,0 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpaddd zmm3,zmm2,ZMMWORD[ddq_add_5678] + vpaddd zmm4,zmm0,ZMMWORD[ddq_add_8888] + lea r10,[byte64_len_to_mask_table] + mov r15,r13 + sub r15,128 + kmovq k1,[r15*8+r10] + vextracti32x4 xmm2,zmm4,2 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vmovdqu8 zmm6,ZMMWORD[r11*1+r9] + vmovdqu8 zmm7,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm10{k1}{z},[128+r11*1+r9] + vbroadcastf64x2 zmm15,ZMMWORD[rcx] + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm15 + vpxorq zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[16+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[32+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[48+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[64+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[80+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[96+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[112+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[128+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[192+rcx] + vaesenclast zmm0,zmm0,zmm15 + vaesenclast zmm3,zmm3,zmm15 + vaesenclast zmm4,zmm4,zmm15 + vpxorq zmm0,zmm0,zmm6 + vpxorq zmm3,zmm3,zmm7 + vpxorq zmm4,zmm4,zmm10 + vextracti32x4 xmm12,zmm4,2 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10]{k1},zmm4 + vmovdqu8 zmm4{k1}{z},zmm4 + vpshufb zmm6,zmm0,zmm29 + vpshufb zmm7,zmm3,zmm29 + vpshufb zmm10,zmm4,zmm29 + vextracti32x4 xmm13,zmm10,2 + sub r13,16 * (11 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_328 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[176+rdx] + vpclmulqdq 
zmm0,zmm6,zmm20,0x11 + vpclmulqdq zmm3,zmm6,zmm20,0x00 + vpclmulqdq zmm4,zmm6,zmm20,0x01 + vpclmulqdq zmm5,zmm6,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[240+rdx] + vpclmulqdq zmm15,zmm7,zmm20,0x11 + vpclmulqdq zmm16,zmm7,zmm20,0x00 + vpclmulqdq zmm17,zmm7,zmm20,0x01 + vpclmulqdq zmm19,zmm7,zmm20,0x10 + vpxorq zmm15,zmm0,zmm15 + vpxorq zmm16,zmm3,zmm16 + vpxorq zmm17,zmm4,zmm17 + vpxorq zmm19,zmm5,zmm19 + vmovdqu64 ymm20,YMMWORD[304+rdx] + vinserti64x2 zmm20,zmm20,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm10,zmm20,0x01 + vpclmulqdq zmm5,zmm10,zmm20,0x10 + vpclmulqdq zmm0,zmm10,zmm20,0x11 + vpclmulqdq zmm3,zmm10,zmm20,0x00 + + vpxorq zmm4,zmm4,zmm17 + vpxorq zmm5,zmm5,zmm19 + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm16 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_328 +$L$_small_initial_partial_block_328: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm12 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[192+rdx] + vpclmulqdq zmm0,zmm6,zmm20,0x11 + vpclmulqdq zmm3,zmm6,zmm20,0x00 + vpclmulqdq zmm4,zmm6,zmm20,0x01 + vpclmulqdq zmm5,zmm6,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[256+rdx] + vpclmulqdq zmm15,zmm7,zmm20,0x11 + vpclmulqdq zmm16,zmm7,zmm20,0x00 + vpclmulqdq zmm17,zmm7,zmm20,0x01 + vpclmulqdq zmm19,zmm7,zmm20,0x10 + vpxorq zmm15,zmm0,zmm15 + vpxorq zmm16,zmm3,zmm16 + vpxorq zmm17,zmm4,zmm17 + vpxorq zmm19,zmm5,zmm19 + vmovdqu64 ymm20,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm10,ymm20,0x01 + vpclmulqdq ymm5,ymm10,ymm20,0x10 + vpclmulqdq ymm0,ymm10,ymm20,0x11 + vpclmulqdq ymm3,ymm10,ymm20,0x00 + + vpxorq zmm4,zmm4,zmm17 + vpxorq zmm5,zmm5,zmm19 + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm16 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_328: + + or r13,r13 + je NEAR $L$_after_reduction_328 + vpxorq xmm14,xmm14,xmm13 +$L$_after_reduction_328: + jmp NEAR $L$_small_initial_blocks_encrypted_317 +$L$_small_initial_num_blocks_is_12_317: + vmovdqa64 zmm29,ZMMWORD[SHUF_MASK] + vshufi64x2 zmm2,zmm2,zmm2,0 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpaddd zmm3,zmm2,ZMMWORD[ddq_add_5678] + vpaddd zmm4,zmm0,ZMMWORD[ddq_add_8888] + lea r10,[byte64_len_to_mask_table] + mov r15,r13 + sub r15,128 + kmovq k1,[r15*8+r10] + vextracti32x4 xmm2,zmm4,3 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vmovdqu8 zmm6,ZMMWORD[r11*1+r9] + vmovdqu8 zmm7,ZMMWORD[64+r11*1+r9] + vmovdqu8 
zmm10{k1}{z},[128+r11*1+r9] + vbroadcastf64x2 zmm15,ZMMWORD[rcx] + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm15 + vpxorq zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[16+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[32+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[48+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[64+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[80+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[96+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[112+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[128+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[192+rcx] + vaesenclast zmm0,zmm0,zmm15 + vaesenclast zmm3,zmm3,zmm15 + vaesenclast zmm4,zmm4,zmm15 + vpxorq zmm0,zmm0,zmm6 + vpxorq zmm3,zmm3,zmm7 + vpxorq zmm4,zmm4,zmm10 + vextracti32x4 xmm12,zmm4,3 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10]{k1},zmm4 + vmovdqu8 zmm4{k1}{z},zmm4 + vpshufb zmm6,zmm0,zmm29 + vpshufb zmm7,zmm3,zmm29 + vpshufb zmm10,zmm4,zmm29 + vextracti32x4 xmm13,zmm10,3 + sub r13,16 * (12 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_329 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[160+rdx] + vpclmulqdq zmm0,zmm6,zmm20,0x11 + vpclmulqdq zmm3,zmm6,zmm20,0x00 + vpclmulqdq zmm4,zmm6,zmm20,0x01 + vpclmulqdq zmm5,zmm6,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[224+rdx] + vpclmulqdq zmm15,zmm7,zmm20,0x11 + vpclmulqdq zmm16,zmm7,zmm20,0x00 + vpclmulqdq zmm17,zmm7,zmm20,0x01 + vpclmulqdq zmm19,zmm7,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[288+rdx] + vpclmulqdq zmm6,zmm10,zmm20,0x11 + vpclmulqdq zmm7,zmm10,zmm20,0x00 + vpternlogq zmm15,zmm6,zmm0,0x96 + vpternlogq zmm16,zmm7,zmm3,0x96 + vpclmulqdq zmm6,zmm10,zmm20,0x01 + vpclmulqdq zmm7,zmm10,zmm20,0x10 + vpternlogq zmm17,zmm6,zmm4,0x96 + vpternlogq zmm19,zmm7,zmm5,0x96 + + vpxorq zmm17,zmm17,zmm19 + vpsrldq zmm4,zmm17,8 + vpslldq zmm5,zmm17,8 + vpxorq zmm0,zmm15,zmm4 + vpxorq zmm3,zmm16,zmm5 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_329 +$L$_small_initial_partial_block_329: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm12 + vpxorq 
zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[176+rdx] + vpclmulqdq zmm0,zmm6,zmm20,0x11 + vpclmulqdq zmm3,zmm6,zmm20,0x00 + vpclmulqdq zmm4,zmm6,zmm20,0x01 + vpclmulqdq zmm5,zmm6,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[240+rdx] + vpclmulqdq zmm15,zmm7,zmm20,0x11 + vpclmulqdq zmm16,zmm7,zmm20,0x00 + vpclmulqdq zmm17,zmm7,zmm20,0x01 + vpclmulqdq zmm19,zmm7,zmm20,0x10 + vpxorq zmm15,zmm0,zmm15 + vpxorq zmm16,zmm3,zmm16 + vpxorq zmm17,zmm4,zmm17 + vpxorq zmm19,zmm5,zmm19 + vmovdqu64 ymm20,YMMWORD[304+rdx] + vinserti64x2 zmm20,zmm20,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm10,zmm20,0x01 + vpclmulqdq zmm5,zmm10,zmm20,0x10 + vpclmulqdq zmm0,zmm10,zmm20,0x11 + vpclmulqdq zmm3,zmm10,zmm20,0x00 + + vpxorq zmm4,zmm4,zmm17 + vpxorq zmm5,zmm5,zmm19 + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm16 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_329: + + or r13,r13 + je NEAR $L$_after_reduction_329 + vpxorq xmm14,xmm14,xmm13 +$L$_after_reduction_329: + jmp NEAR $L$_small_initial_blocks_encrypted_317 +$L$_small_initial_num_blocks_is_13_317: + vmovdqa64 zmm29,ZMMWORD[SHUF_MASK] + vshufi64x2 zmm2,zmm2,zmm2,0 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpaddd zmm3,zmm2,ZMMWORD[ddq_add_5678] + vpaddd zmm4,zmm0,ZMMWORD[ddq_add_8888] + vpaddd zmm5,zmm3,ZMMWORD[ddq_add_8888] + lea r10,[byte64_len_to_mask_table] + mov r15,r13 + sub r15,192 + kmovq k1,[r15*8+r10] + vextracti32x4 xmm2,zmm5,0 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb xmm5,xmm5,xmm29 + vmovdqu8 zmm6,ZMMWORD[r11*1+r9] + vmovdqu8 zmm7,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm10,ZMMWORD[128+r11*1+r9] + vmovdqu8 xmm11{k1}{z},[192+r11*1+r9] + vbroadcastf64x2 zmm15,ZMMWORD[rcx] + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm15 + vpxorq zmm4,zmm4,zmm15 + vpxorq xmm5,xmm5,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[16+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc xmm5,xmm5,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[32+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc xmm5,xmm5,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[48+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc xmm5,xmm5,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[64+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc xmm5,xmm5,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[80+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc xmm5,xmm5,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[96+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc xmm5,xmm5,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[112+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc xmm5,xmm5,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[128+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc xmm5,xmm5,xmm15 + vbroadcastf64x2 
zmm15,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc xmm5,xmm5,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc xmm5,xmm5,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc xmm5,xmm5,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[192+rcx] + vaesenclast zmm0,zmm0,zmm15 + vaesenclast zmm3,zmm3,zmm15 + vaesenclast zmm4,zmm4,zmm15 + vaesenclast xmm5,xmm5,xmm15 + vpxorq zmm0,zmm0,zmm6 + vpxorq zmm3,zmm3,zmm7 + vpxorq zmm4,zmm4,zmm10 + vpxorq xmm5,xmm5,xmm11 + vextracti32x4 xmm12,zmm5,0 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm4 + vmovdqu8 XMMWORD[192+r11*1+r10]{k1},xmm5 + vmovdqu8 zmm5{k1}{z},zmm5 + vpshufb zmm6,zmm0,zmm29 + vpshufb zmm7,zmm3,zmm29 + vpshufb zmm10,zmm4,zmm29 + vpshufb xmm11,xmm5,xmm29 + vextracti32x4 xmm13,zmm11,0 + sub r13,16 * (13 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_330 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[144+rdx] + vpclmulqdq zmm0,zmm6,zmm20,0x11 + vpclmulqdq zmm3,zmm6,zmm20,0x00 + vpclmulqdq zmm4,zmm6,zmm20,0x01 + vpclmulqdq zmm5,zmm6,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[208+rdx] + vpclmulqdq zmm15,zmm7,zmm20,0x11 + vpclmulqdq zmm16,zmm7,zmm20,0x00 + vpclmulqdq zmm17,zmm7,zmm20,0x01 + vpclmulqdq zmm19,zmm7,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[272+rdx] + vpclmulqdq zmm6,zmm10,zmm20,0x11 + vpclmulqdq zmm7,zmm10,zmm20,0x00 + vpternlogq zmm15,zmm6,zmm0,0x96 + vpternlogq zmm16,zmm7,zmm3,0x96 + vpclmulqdq zmm6,zmm10,zmm20,0x01 + vpclmulqdq zmm7,zmm10,zmm20,0x10 + vpternlogq zmm17,zmm6,zmm4,0x96 + vpternlogq zmm19,zmm7,zmm5,0x96 + vmovdqu64 xmm20,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm11,xmm20,0x01 + vpclmulqdq xmm5,xmm11,xmm20,0x10 + vpclmulqdq xmm0,xmm11,xmm20,0x11 + vpclmulqdq xmm3,xmm11,xmm20,0x00 + + vpxorq zmm4,zmm4,zmm17 + vpxorq zmm5,zmm5,zmm19 + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm16 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_330 +$L$_small_initial_partial_block_330: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm12 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[160+rdx] + vpclmulqdq zmm0,zmm6,zmm20,0x11 + vpclmulqdq zmm3,zmm6,zmm20,0x00 + vpclmulqdq zmm4,zmm6,zmm20,0x01 + vpclmulqdq zmm5,zmm6,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[224+rdx] + vpclmulqdq zmm15,zmm7,zmm20,0x11 + vpclmulqdq zmm16,zmm7,zmm20,0x00 + vpclmulqdq zmm17,zmm7,zmm20,0x01 + vpclmulqdq zmm19,zmm7,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[288+rdx] + vpclmulqdq zmm6,zmm10,zmm20,0x11 + vpclmulqdq zmm7,zmm10,zmm20,0x00 + vpternlogq zmm15,zmm6,zmm0,0x96 + vpternlogq zmm16,zmm7,zmm3,0x96 + vpclmulqdq zmm6,zmm10,zmm20,0x01 + vpclmulqdq zmm7,zmm10,zmm20,0x10 + vpternlogq zmm17,zmm6,zmm4,0x96 + 
vpternlogq zmm19,zmm7,zmm5,0x96 + + vpxorq zmm17,zmm17,zmm19 + vpsrldq zmm4,zmm17,8 + vpslldq zmm5,zmm17,8 + vpxorq zmm0,zmm15,zmm4 + vpxorq zmm3,zmm16,zmm5 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_330: + + or r13,r13 + je NEAR $L$_after_reduction_330 + vpxorq xmm14,xmm14,xmm13 +$L$_after_reduction_330: + jmp NEAR $L$_small_initial_blocks_encrypted_317 +$L$_small_initial_num_blocks_is_14_317: + vmovdqa64 zmm29,ZMMWORD[SHUF_MASK] + vshufi64x2 zmm2,zmm2,zmm2,0 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpaddd zmm3,zmm2,ZMMWORD[ddq_add_5678] + vpaddd zmm4,zmm0,ZMMWORD[ddq_add_8888] + vpaddd zmm5,zmm3,ZMMWORD[ddq_add_8888] + lea r10,[byte64_len_to_mask_table] + mov r15,r13 + sub r15,192 + kmovq k1,[r15*8+r10] + vextracti32x4 xmm2,zmm5,1 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb ymm5,ymm5,ymm29 + vmovdqu8 zmm6,ZMMWORD[r11*1+r9] + vmovdqu8 zmm7,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm10,ZMMWORD[128+r11*1+r9] + vmovdqu8 ymm11{k1}{z},[192+r11*1+r9] + vbroadcastf64x2 zmm15,ZMMWORD[rcx] + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm15 + vpxorq zmm4,zmm4,zmm15 + vpxorq ymm5,ymm5,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[16+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc ymm5,ymm5,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[32+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc ymm5,ymm5,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[48+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc ymm5,ymm5,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[64+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc ymm5,ymm5,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[80+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc ymm5,ymm5,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[96+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc ymm5,ymm5,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[112+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc ymm5,ymm5,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[128+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc ymm5,ymm5,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc ymm5,ymm5,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc ymm5,ymm5,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc ymm5,ymm5,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[192+rcx] + vaesenclast zmm0,zmm0,zmm15 + vaesenclast zmm3,zmm3,zmm15 + vaesenclast zmm4,zmm4,zmm15 + vaesenclast ymm5,ymm5,ymm15 + vpxorq zmm0,zmm0,zmm6 + vpxorq zmm3,zmm3,zmm7 + vpxorq zmm4,zmm4,zmm10 + vpxorq ymm5,ymm5,ymm11 + vextracti32x4 xmm12,zmm5,1 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + 
vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm4 + vmovdqu8 YMMWORD[192+r11*1+r10]{k1},ymm5 + vmovdqu8 zmm5{k1}{z},zmm5 + vpshufb zmm6,zmm0,zmm29 + vpshufb zmm7,zmm3,zmm29 + vpshufb zmm10,zmm4,zmm29 + vpshufb ymm11,ymm5,ymm29 + vextracti32x4 xmm13,zmm11,1 + sub r13,16 * (14 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_331 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[128+rdx] + vpclmulqdq zmm0,zmm6,zmm20,0x11 + vpclmulqdq zmm3,zmm6,zmm20,0x00 + vpclmulqdq zmm4,zmm6,zmm20,0x01 + vpclmulqdq zmm5,zmm6,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[192+rdx] + vpclmulqdq zmm15,zmm7,zmm20,0x11 + vpclmulqdq zmm16,zmm7,zmm20,0x00 + vpclmulqdq zmm17,zmm7,zmm20,0x01 + vpclmulqdq zmm19,zmm7,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[256+rdx] + vpclmulqdq zmm6,zmm10,zmm20,0x11 + vpclmulqdq zmm7,zmm10,zmm20,0x00 + vpternlogq zmm15,zmm6,zmm0,0x96 + vpternlogq zmm16,zmm7,zmm3,0x96 + vpclmulqdq zmm6,zmm10,zmm20,0x01 + vpclmulqdq zmm7,zmm10,zmm20,0x10 + vpternlogq zmm17,zmm6,zmm4,0x96 + vpternlogq zmm19,zmm7,zmm5,0x96 + vmovdqu64 ymm20,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm11,ymm20,0x01 + vpclmulqdq ymm5,ymm11,ymm20,0x10 + vpclmulqdq ymm0,ymm11,ymm20,0x11 + vpclmulqdq ymm3,ymm11,ymm20,0x00 + + vpxorq zmm4,zmm4,zmm17 + vpxorq zmm5,zmm5,zmm19 + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm16 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_331 +$L$_small_initial_partial_block_331: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm12 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[144+rdx] + vpclmulqdq zmm0,zmm6,zmm20,0x11 + vpclmulqdq zmm3,zmm6,zmm20,0x00 + vpclmulqdq zmm4,zmm6,zmm20,0x01 + vpclmulqdq zmm5,zmm6,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[208+rdx] + vpclmulqdq zmm15,zmm7,zmm20,0x11 + vpclmulqdq zmm16,zmm7,zmm20,0x00 + vpclmulqdq zmm17,zmm7,zmm20,0x01 + vpclmulqdq zmm19,zmm7,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[272+rdx] + vpclmulqdq zmm6,zmm10,zmm20,0x11 + vpclmulqdq zmm7,zmm10,zmm20,0x00 + vpternlogq zmm15,zmm6,zmm0,0x96 + vpternlogq zmm16,zmm7,zmm3,0x96 + vpclmulqdq zmm6,zmm10,zmm20,0x01 + vpclmulqdq zmm7,zmm10,zmm20,0x10 + vpternlogq zmm17,zmm6,zmm4,0x96 + vpternlogq zmm19,zmm7,zmm5,0x96 + vmovdqu64 xmm20,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm11,xmm20,0x01 + vpclmulqdq xmm5,xmm11,xmm20,0x10 + vpclmulqdq xmm0,xmm11,xmm20,0x11 + vpclmulqdq xmm3,xmm11,xmm20,0x00 + + vpxorq zmm4,zmm4,zmm17 + vpxorq zmm5,zmm5,zmm19 + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm16 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + 
vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_331: + + or r13,r13 + je NEAR $L$_after_reduction_331 + vpxorq xmm14,xmm14,xmm13 +$L$_after_reduction_331: + jmp NEAR $L$_small_initial_blocks_encrypted_317 +$L$_small_initial_num_blocks_is_15_317: + vmovdqa64 zmm29,ZMMWORD[SHUF_MASK] + vshufi64x2 zmm2,zmm2,zmm2,0 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpaddd zmm3,zmm2,ZMMWORD[ddq_add_5678] + vpaddd zmm4,zmm0,ZMMWORD[ddq_add_8888] + vpaddd zmm5,zmm3,ZMMWORD[ddq_add_8888] + lea r10,[byte64_len_to_mask_table] + mov r15,r13 + sub r15,192 + kmovq k1,[r15*8+r10] + vextracti32x4 xmm2,zmm5,2 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb zmm5,zmm5,zmm29 + vmovdqu8 zmm6,ZMMWORD[r11*1+r9] + vmovdqu8 zmm7,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm10,ZMMWORD[128+r11*1+r9] + vmovdqu8 zmm11{k1}{z},[192+r11*1+r9] + vbroadcastf64x2 zmm15,ZMMWORD[rcx] + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm15 + vpxorq zmm4,zmm4,zmm15 + vpxorq zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[16+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[32+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[48+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[64+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[80+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[96+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[112+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[128+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[192+rcx] + vaesenclast zmm0,zmm0,zmm15 + vaesenclast zmm3,zmm3,zmm15 + vaesenclast zmm4,zmm4,zmm15 + vaesenclast zmm5,zmm5,zmm15 + vpxorq zmm0,zmm0,zmm6 + vpxorq zmm3,zmm3,zmm7 + vpxorq zmm4,zmm4,zmm10 + vpxorq zmm5,zmm5,zmm11 + vextracti32x4 xmm12,zmm5,2 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm4 + vmovdqu8 ZMMWORD[192+r11*1+r10]{k1},zmm5 + vmovdqu8 zmm5{k1}{z},zmm5 + vpshufb zmm6,zmm0,zmm29 + vpshufb zmm7,zmm3,zmm29 + vpshufb zmm10,zmm4,zmm29 + vpshufb zmm11,zmm5,zmm29 + vextracti32x4 xmm13,zmm11,2 + sub r13,16 * (15 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_332 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[112+rdx] + vpclmulqdq 
zmm0,zmm6,zmm20,0x11 + vpclmulqdq zmm3,zmm6,zmm20,0x00 + vpclmulqdq zmm4,zmm6,zmm20,0x01 + vpclmulqdq zmm5,zmm6,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[176+rdx] + vpclmulqdq zmm15,zmm7,zmm20,0x11 + vpclmulqdq zmm16,zmm7,zmm20,0x00 + vpclmulqdq zmm17,zmm7,zmm20,0x01 + vpclmulqdq zmm19,zmm7,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[240+rdx] + vpclmulqdq zmm6,zmm10,zmm20,0x11 + vpclmulqdq zmm7,zmm10,zmm20,0x00 + vpternlogq zmm15,zmm6,zmm0,0x96 + vpternlogq zmm16,zmm7,zmm3,0x96 + vpclmulqdq zmm6,zmm10,zmm20,0x01 + vpclmulqdq zmm7,zmm10,zmm20,0x10 + vpternlogq zmm17,zmm6,zmm4,0x96 + vpternlogq zmm19,zmm7,zmm5,0x96 + vmovdqu64 ymm20,YMMWORD[304+rdx] + vinserti64x2 zmm20,zmm20,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm11,zmm20,0x01 + vpclmulqdq zmm5,zmm11,zmm20,0x10 + vpclmulqdq zmm0,zmm11,zmm20,0x11 + vpclmulqdq zmm3,zmm11,zmm20,0x00 + + vpxorq zmm4,zmm4,zmm17 + vpxorq zmm5,zmm5,zmm19 + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm16 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_332 +$L$_small_initial_partial_block_332: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm12 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[128+rdx] + vpclmulqdq zmm0,zmm6,zmm20,0x11 + vpclmulqdq zmm3,zmm6,zmm20,0x00 + vpclmulqdq zmm4,zmm6,zmm20,0x01 + vpclmulqdq zmm5,zmm6,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[192+rdx] + vpclmulqdq zmm15,zmm7,zmm20,0x11 + vpclmulqdq zmm16,zmm7,zmm20,0x00 + vpclmulqdq zmm17,zmm7,zmm20,0x01 + vpclmulqdq zmm19,zmm7,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[256+rdx] + vpclmulqdq zmm6,zmm10,zmm20,0x11 + vpclmulqdq zmm7,zmm10,zmm20,0x00 + vpternlogq zmm15,zmm6,zmm0,0x96 + vpternlogq zmm16,zmm7,zmm3,0x96 + vpclmulqdq zmm6,zmm10,zmm20,0x01 + vpclmulqdq zmm7,zmm10,zmm20,0x10 + vpternlogq zmm17,zmm6,zmm4,0x96 + vpternlogq zmm19,zmm7,zmm5,0x96 + vmovdqu64 ymm20,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm11,ymm20,0x01 + vpclmulqdq ymm5,ymm11,ymm20,0x10 + vpclmulqdq ymm0,ymm11,ymm20,0x11 + vpclmulqdq ymm3,ymm11,ymm20,0x00 + + vpxorq zmm4,zmm4,zmm17 + vpxorq zmm5,zmm5,zmm19 + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm16 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_332: + + or r13,r13 + je NEAR $L$_after_reduction_332 + vpxorq xmm14,xmm14,xmm13 +$L$_after_reduction_332: + jmp NEAR $L$_small_initial_blocks_encrypted_317 +$L$_small_initial_num_blocks_is_16_317: + vmovdqa64 zmm29,ZMMWORD[SHUF_MASK] + vshufi64x2 
zmm2,zmm2,zmm2,0 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpaddd zmm3,zmm2,ZMMWORD[ddq_add_5678] + vpaddd zmm4,zmm0,ZMMWORD[ddq_add_8888] + vpaddd zmm5,zmm3,ZMMWORD[ddq_add_8888] + lea r10,[byte64_len_to_mask_table] + mov r15,r13 + sub r15,192 + kmovq k1,[r15*8+r10] + vextracti32x4 xmm2,zmm5,3 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb zmm5,zmm5,zmm29 + vmovdqu8 zmm6,ZMMWORD[r11*1+r9] + vmovdqu8 zmm7,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm10,ZMMWORD[128+r11*1+r9] + vmovdqu8 zmm11{k1}{z},[192+r11*1+r9] + vbroadcastf64x2 zmm15,ZMMWORD[rcx] + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm15 + vpxorq zmm4,zmm4,zmm15 + vpxorq zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[16+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[32+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[48+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[64+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[80+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[96+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[112+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[128+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[192+rcx] + vaesenclast zmm0,zmm0,zmm15 + vaesenclast zmm3,zmm3,zmm15 + vaesenclast zmm4,zmm4,zmm15 + vaesenclast zmm5,zmm5,zmm15 + vpxorq zmm0,zmm0,zmm6 + vpxorq zmm3,zmm3,zmm7 + vpxorq zmm4,zmm4,zmm10 + vpxorq zmm5,zmm5,zmm11 + vextracti32x4 xmm12,zmm5,3 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm4 + vmovdqu8 ZMMWORD[192+r11*1+r10]{k1},zmm5 + vmovdqu8 zmm5{k1}{z},zmm5 + vpshufb zmm6,zmm0,zmm29 + vpshufb zmm7,zmm3,zmm29 + vpshufb zmm10,zmm4,zmm29 + vpshufb zmm11,zmm5,zmm29 + vextracti32x4 xmm13,zmm11,3 + sub r13,16 * (16 - 1) +$L$_small_initial_partial_block_333: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm12 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[112+rdx] + vpclmulqdq zmm0,zmm6,zmm20,0x11 + vpclmulqdq zmm3,zmm6,zmm20,0x00 + vpclmulqdq zmm4,zmm6,zmm20,0x01 + vpclmulqdq zmm5,zmm6,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[176+rdx] + vpclmulqdq zmm15,zmm7,zmm20,0x11 + vpclmulqdq zmm16,zmm7,zmm20,0x00 + vpclmulqdq zmm17,zmm7,zmm20,0x01 + vpclmulqdq zmm19,zmm7,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[240+rdx] + vpclmulqdq zmm6,zmm10,zmm20,0x11 + vpclmulqdq zmm7,zmm10,zmm20,0x00 + vpternlogq zmm15,zmm6,zmm0,0x96 + 
vpternlogq zmm16,zmm7,zmm3,0x96 + vpclmulqdq zmm6,zmm10,zmm20,0x01 + vpclmulqdq zmm7,zmm10,zmm20,0x10 + vpternlogq zmm17,zmm6,zmm4,0x96 + vpternlogq zmm19,zmm7,zmm5,0x96 + vmovdqu64 ymm20,YMMWORD[304+rdx] + vinserti64x2 zmm20,zmm20,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm11,zmm20,0x01 + vpclmulqdq zmm5,zmm11,zmm20,0x10 + vpclmulqdq zmm0,zmm11,zmm20,0x11 + vpclmulqdq zmm3,zmm11,zmm20,0x00 + + vpxorq zmm4,zmm4,zmm17 + vpxorq zmm5,zmm5,zmm19 + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm16 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_333: + vpxorq xmm14,xmm14,xmm13 +$L$_after_reduction_333: +$L$_small_initial_blocks_encrypted_317: +$L$_ghash_done_172: + vmovdqu64 XMMWORD[rdx],xmm2 + vmovdqu64 XMMWORD[64+rdx],xmm14 +$L$_enc_dec_done_172: + jmp NEAR $L$exit_gcm_encrypt +ALIGN 32 +$L$aes_gcm_encrypt_256_avx512: + cmp QWORD[112+rbp],0 + je NEAR $L$_enc_dec_done_334 + xor r14,r14 + vmovdqu64 xmm14,XMMWORD[64+rdx] + + mov r11,QWORD[r8] + or r11,r11 + je NEAR $L$_partial_block_done_335 + mov r10d,16 + lea r12,[byte_len_to_mask_table] + cmp QWORD[112+rbp],r10 + cmovc r10,QWORD[112+rbp] + add r12,r10 + add r12,r10 + kmovw k1,[r12] + vmovdqu8 xmm0{k1}{z},[r9] + + vmovdqu64 xmm3,XMMWORD[16+rdx] + vmovdqu64 xmm4,XMMWORD[336+rdx] + + + + lea r12,[SHIFT_MASK] + add r12,r11 + vmovdqu64 xmm5,XMMWORD[r12] + vpshufb xmm3,xmm3,xmm5 + vpxorq xmm3,xmm3,xmm0 + + + mov r13,QWORD[112+rbp] + add r13,r11 + sub r13,16 + jge NEAR $L$_no_extra_mask_335 + sub r12,r13 +$L$_no_extra_mask_335: + + + + vmovdqu64 xmm0,XMMWORD[16+r12] + vpand xmm3,xmm3,xmm0 + vpshufb xmm3,xmm3,XMMWORD[SHUF_MASK] + vpshufb xmm3,xmm3,xmm5 + vpxorq xmm14,xmm14,xmm3 + cmp r13,0 + jl NEAR $L$_partial_incomplete_335 + + vpclmulqdq xmm7,xmm14,xmm4,0x11 + vpclmulqdq xmm10,xmm14,xmm4,0x00 + vpclmulqdq xmm11,xmm14,xmm4,0x01 + vpclmulqdq xmm14,xmm14,xmm4,0x10 + vpxorq xmm14,xmm14,xmm11 + + vpsrldq xmm11,xmm14,8 + vpslldq xmm14,xmm14,8 + vpxorq xmm7,xmm7,xmm11 + vpxorq xmm14,xmm14,xmm10 + + + + vmovdqu64 xmm11,XMMWORD[POLY2] + + vpclmulqdq xmm10,xmm11,xmm14,0x01 + vpslldq xmm10,xmm10,8 + vpxorq xmm14,xmm14,xmm10 + + + + vpclmulqdq xmm10,xmm11,xmm14,0x00 + vpsrldq xmm10,xmm10,4 + vpclmulqdq xmm14,xmm11,xmm14,0x10 + vpslldq xmm14,xmm14,4 + + vpternlogq xmm14,xmm7,xmm10,0x96 + + mov QWORD[r8],0 + + mov r12,r11 + mov r11,16 + sub r11,r12 + jmp NEAR $L$_enc_dec_done_335 + +$L$_partial_incomplete_335: + mov r12,QWORD[112+rbp] + add QWORD[r8],r12 + mov r11,QWORD[112+rbp] + +$L$_enc_dec_done_335: + + + lea r12,[byte_len_to_mask_table] + kmovw k1,[r11*2+r12] + vmovdqu64 XMMWORD[64+rdx],xmm14 + + vpshufb xmm3,xmm3,XMMWORD[SHUF_MASK] + vpshufb xmm3,xmm3,xmm5 + mov r12,QWORD[120+rbp] + vmovdqu8 XMMWORD[r12]{k1},xmm3 +$L$_partial_block_done_335: + vmovdqu64 xmm2,XMMWORD[rdx] + mov r13,QWORD[112+rbp] + sub r13,r11 + je NEAR $L$_enc_dec_done_334 + cmp r13,256 + jbe NEAR $L$_message_below_equal_16_blocks_334 + + vmovdqa64 zmm29,ZMMWORD[SHUF_MASK] + vmovdqa64 
zmm27,ZMMWORD[ddq_addbe_4444] + vmovdqa64 zmm28,ZMMWORD[ddq_addbe_1234] + + + + + + + vmovd r15d,xmm2 + and r15d,255 + + vshufi64x2 zmm2,zmm2,zmm2,0 + vpshufb zmm2,zmm2,zmm29 + + + + cmp r15b,240 + jae NEAR $L$_next_16_overflow_336 + vpaddd zmm7,zmm2,zmm28 + vpaddd zmm10,zmm7,zmm27 + vpaddd zmm11,zmm10,zmm27 + vpaddd zmm12,zmm11,zmm27 + jmp NEAR $L$_next_16_ok_336 +$L$_next_16_overflow_336: + vpshufb zmm2,zmm2,zmm29 + vmovdqa64 zmm12,ZMMWORD[ddq_add_4444] + vpaddd zmm7,zmm2,ZMMWORD[ddq_add_1234] + vpaddd zmm10,zmm7,zmm12 + vpaddd zmm11,zmm10,zmm12 + vpaddd zmm12,zmm11,zmm12 + vpshufb zmm7,zmm7,zmm29 + vpshufb zmm10,zmm10,zmm29 + vpshufb zmm11,zmm11,zmm29 + vpshufb zmm12,zmm12,zmm29 +$L$_next_16_ok_336: + vshufi64x2 zmm2,zmm12,zmm12,255 + add r15b,16 + + vmovdqu8 zmm0,ZMMWORD[r11*1+r9] + vmovdqu8 zmm3,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm4,ZMMWORD[128+r11*1+r9] + vmovdqu8 zmm5,ZMMWORD[192+r11*1+r9] + + + vbroadcastf64x2 zmm6,ZMMWORD[rcx] + vpxorq zmm7,zmm7,zmm6 + vpxorq zmm10,zmm10,zmm6 + vpxorq zmm11,zmm11,zmm6 + vpxorq zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[16+rcx] + vaesenc zmm7,zmm7,zmm6 + vaesenc zmm10,zmm10,zmm6 + vaesenc zmm11,zmm11,zmm6 + vaesenc zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[32+rcx] + vaesenc zmm7,zmm7,zmm6 + vaesenc zmm10,zmm10,zmm6 + vaesenc zmm11,zmm11,zmm6 + vaesenc zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[48+rcx] + vaesenc zmm7,zmm7,zmm6 + vaesenc zmm10,zmm10,zmm6 + vaesenc zmm11,zmm11,zmm6 + vaesenc zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[64+rcx] + vaesenc zmm7,zmm7,zmm6 + vaesenc zmm10,zmm10,zmm6 + vaesenc zmm11,zmm11,zmm6 + vaesenc zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[80+rcx] + vaesenc zmm7,zmm7,zmm6 + vaesenc zmm10,zmm10,zmm6 + vaesenc zmm11,zmm11,zmm6 + vaesenc zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[96+rcx] + vaesenc zmm7,zmm7,zmm6 + vaesenc zmm10,zmm10,zmm6 + vaesenc zmm11,zmm11,zmm6 + vaesenc zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[112+rcx] + vaesenc zmm7,zmm7,zmm6 + vaesenc zmm10,zmm10,zmm6 + vaesenc zmm11,zmm11,zmm6 + vaesenc zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[128+rcx] + vaesenc zmm7,zmm7,zmm6 + vaesenc zmm10,zmm10,zmm6 + vaesenc zmm11,zmm11,zmm6 + vaesenc zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[144+rcx] + vaesenc zmm7,zmm7,zmm6 + vaesenc zmm10,zmm10,zmm6 + vaesenc zmm11,zmm11,zmm6 + vaesenc zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[160+rcx] + vaesenc zmm7,zmm7,zmm6 + vaesenc zmm10,zmm10,zmm6 + vaesenc zmm11,zmm11,zmm6 + vaesenc zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[176+rcx] + vaesenc zmm7,zmm7,zmm6 + vaesenc zmm10,zmm10,zmm6 + vaesenc zmm11,zmm11,zmm6 + vaesenc zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[192+rcx] + vaesenc zmm7,zmm7,zmm6 + vaesenc zmm10,zmm10,zmm6 + vaesenc zmm11,zmm11,zmm6 + vaesenc zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[208+rcx] + vaesenc zmm7,zmm7,zmm6 + vaesenc zmm10,zmm10,zmm6 + vaesenc zmm11,zmm11,zmm6 + vaesenc zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[224+rcx] + vaesenclast zmm7,zmm7,zmm6 + vaesenclast zmm10,zmm10,zmm6 + vaesenclast zmm11,zmm11,zmm6 + vaesenclast zmm12,zmm12,zmm6 + + + vpxorq zmm7,zmm7,zmm0 + vpxorq zmm10,zmm10,zmm3 + vpxorq zmm11,zmm11,zmm4 + vpxorq zmm12,zmm12,zmm5 + + + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm7 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm10 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm11 + vmovdqu8 ZMMWORD[192+r11*1+r10],zmm12 + + vpshufb zmm7,zmm7,zmm29 + vpshufb zmm10,zmm10,zmm29 + vpshufb zmm11,zmm11,zmm29 + vpshufb zmm12,zmm12,zmm29 + vmovdqa64 ZMMWORD[768+rsp],zmm7 + vmovdqa64 
ZMMWORD[832+rsp],zmm10 + vmovdqa64 ZMMWORD[896+rsp],zmm11 + vmovdqa64 ZMMWORD[960+rsp],zmm12 + test r14,r14 + jnz NEAR $L$_skip_hkeys_precomputation_337 + + vmovdqu64 zmm0,ZMMWORD[288+rdx] + vmovdqu64 ZMMWORD[704+rsp],zmm0 + + vmovdqu64 zmm3,ZMMWORD[224+rdx] + vmovdqu64 ZMMWORD[640+rsp],zmm3 + + + vshufi64x2 zmm3,zmm3,zmm3,0x00 + + vmovdqu64 zmm4,ZMMWORD[160+rdx] + vmovdqu64 ZMMWORD[576+rsp],zmm4 + + vmovdqu64 zmm5,ZMMWORD[96+rdx] + vmovdqu64 ZMMWORD[512+rsp],zmm5 +$L$_skip_hkeys_precomputation_337: + cmp r13,512 + jb NEAR $L$_message_below_32_blocks_334 + + + + cmp r15b,240 + jae NEAR $L$_next_16_overflow_338 + vpaddd zmm7,zmm2,zmm28 + vpaddd zmm10,zmm7,zmm27 + vpaddd zmm11,zmm10,zmm27 + vpaddd zmm12,zmm11,zmm27 + jmp NEAR $L$_next_16_ok_338 +$L$_next_16_overflow_338: + vpshufb zmm2,zmm2,zmm29 + vmovdqa64 zmm12,ZMMWORD[ddq_add_4444] + vpaddd zmm7,zmm2,ZMMWORD[ddq_add_1234] + vpaddd zmm10,zmm7,zmm12 + vpaddd zmm11,zmm10,zmm12 + vpaddd zmm12,zmm11,zmm12 + vpshufb zmm7,zmm7,zmm29 + vpshufb zmm10,zmm10,zmm29 + vpshufb zmm11,zmm11,zmm29 + vpshufb zmm12,zmm12,zmm29 +$L$_next_16_ok_338: + vshufi64x2 zmm2,zmm12,zmm12,255 + add r15b,16 + + vmovdqu8 zmm0,ZMMWORD[256+r11*1+r9] + vmovdqu8 zmm3,ZMMWORD[320+r11*1+r9] + vmovdqu8 zmm4,ZMMWORD[384+r11*1+r9] + vmovdqu8 zmm5,ZMMWORD[448+r11*1+r9] + + + vbroadcastf64x2 zmm6,ZMMWORD[rcx] + vpxorq zmm7,zmm7,zmm6 + vpxorq zmm10,zmm10,zmm6 + vpxorq zmm11,zmm11,zmm6 + vpxorq zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[16+rcx] + vaesenc zmm7,zmm7,zmm6 + vaesenc zmm10,zmm10,zmm6 + vaesenc zmm11,zmm11,zmm6 + vaesenc zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[32+rcx] + vaesenc zmm7,zmm7,zmm6 + vaesenc zmm10,zmm10,zmm6 + vaesenc zmm11,zmm11,zmm6 + vaesenc zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[48+rcx] + vaesenc zmm7,zmm7,zmm6 + vaesenc zmm10,zmm10,zmm6 + vaesenc zmm11,zmm11,zmm6 + vaesenc zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[64+rcx] + vaesenc zmm7,zmm7,zmm6 + vaesenc zmm10,zmm10,zmm6 + vaesenc zmm11,zmm11,zmm6 + vaesenc zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[80+rcx] + vaesenc zmm7,zmm7,zmm6 + vaesenc zmm10,zmm10,zmm6 + vaesenc zmm11,zmm11,zmm6 + vaesenc zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[96+rcx] + vaesenc zmm7,zmm7,zmm6 + vaesenc zmm10,zmm10,zmm6 + vaesenc zmm11,zmm11,zmm6 + vaesenc zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[112+rcx] + vaesenc zmm7,zmm7,zmm6 + vaesenc zmm10,zmm10,zmm6 + vaesenc zmm11,zmm11,zmm6 + vaesenc zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[128+rcx] + vaesenc zmm7,zmm7,zmm6 + vaesenc zmm10,zmm10,zmm6 + vaesenc zmm11,zmm11,zmm6 + vaesenc zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[144+rcx] + vaesenc zmm7,zmm7,zmm6 + vaesenc zmm10,zmm10,zmm6 + vaesenc zmm11,zmm11,zmm6 + vaesenc zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[160+rcx] + vaesenc zmm7,zmm7,zmm6 + vaesenc zmm10,zmm10,zmm6 + vaesenc zmm11,zmm11,zmm6 + vaesenc zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[176+rcx] + vaesenc zmm7,zmm7,zmm6 + vaesenc zmm10,zmm10,zmm6 + vaesenc zmm11,zmm11,zmm6 + vaesenc zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[192+rcx] + vaesenc zmm7,zmm7,zmm6 + vaesenc zmm10,zmm10,zmm6 + vaesenc zmm11,zmm11,zmm6 + vaesenc zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[208+rcx] + vaesenc zmm7,zmm7,zmm6 + vaesenc zmm10,zmm10,zmm6 + vaesenc zmm11,zmm11,zmm6 + vaesenc zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[224+rcx] + vaesenclast zmm7,zmm7,zmm6 + vaesenclast zmm10,zmm10,zmm6 + vaesenclast zmm11,zmm11,zmm6 + vaesenclast zmm12,zmm12,zmm6 + + + vpxorq zmm7,zmm7,zmm0 + vpxorq zmm10,zmm10,zmm3 + 
vpxorq zmm11,zmm11,zmm4 + vpxorq zmm12,zmm12,zmm5 + + + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[256+r11*1+r10],zmm7 + vmovdqu8 ZMMWORD[320+r11*1+r10],zmm10 + vmovdqu8 ZMMWORD[384+r11*1+r10],zmm11 + vmovdqu8 ZMMWORD[448+r11*1+r10],zmm12 + + vpshufb zmm7,zmm7,zmm29 + vpshufb zmm10,zmm10,zmm29 + vpshufb zmm11,zmm11,zmm29 + vpshufb zmm12,zmm12,zmm29 + vmovdqa64 ZMMWORD[1024+rsp],zmm7 + vmovdqa64 ZMMWORD[1088+rsp],zmm10 + vmovdqa64 ZMMWORD[1152+rsp],zmm11 + vmovdqa64 ZMMWORD[1216+rsp],zmm12 + test r14,r14 + jnz NEAR $L$_skip_hkeys_precomputation_339 + vmovdqu64 zmm3,ZMMWORD[640+rsp] + + + vshufi64x2 zmm3,zmm3,zmm3,0x00 + + vmovdqu64 zmm4,ZMMWORD[576+rsp] + vmovdqu64 zmm5,ZMMWORD[512+rsp] + + vpclmulqdq zmm6,zmm4,zmm3,0x11 + vpclmulqdq zmm7,zmm4,zmm3,0x00 + vpclmulqdq zmm10,zmm4,zmm3,0x01 + vpclmulqdq zmm4,zmm4,zmm3,0x10 + vpxorq zmm4,zmm4,zmm10 + + vpsrldq zmm10,zmm4,8 + vpslldq zmm4,zmm4,8 + vpxorq zmm6,zmm6,zmm10 + vpxorq zmm4,zmm4,zmm7 + + + + vmovdqu64 zmm10,ZMMWORD[POLY2] + + vpclmulqdq zmm7,zmm10,zmm4,0x01 + vpslldq zmm7,zmm7,8 + vpxorq zmm4,zmm4,zmm7 + + + + vpclmulqdq zmm7,zmm10,zmm4,0x00 + vpsrldq zmm7,zmm7,4 + vpclmulqdq zmm4,zmm10,zmm4,0x10 + vpslldq zmm4,zmm4,4 + + vpternlogq zmm4,zmm6,zmm7,0x96 + + vmovdqu64 ZMMWORD[448+rsp],zmm4 + + vpclmulqdq zmm6,zmm5,zmm3,0x11 + vpclmulqdq zmm7,zmm5,zmm3,0x00 + vpclmulqdq zmm10,zmm5,zmm3,0x01 + vpclmulqdq zmm5,zmm5,zmm3,0x10 + vpxorq zmm5,zmm5,zmm10 + + vpsrldq zmm10,zmm5,8 + vpslldq zmm5,zmm5,8 + vpxorq zmm6,zmm6,zmm10 + vpxorq zmm5,zmm5,zmm7 + + + + vmovdqu64 zmm10,ZMMWORD[POLY2] + + vpclmulqdq zmm7,zmm10,zmm5,0x01 + vpslldq zmm7,zmm7,8 + vpxorq zmm5,zmm5,zmm7 + + + + vpclmulqdq zmm7,zmm10,zmm5,0x00 + vpsrldq zmm7,zmm7,4 + vpclmulqdq zmm5,zmm10,zmm5,0x10 + vpslldq zmm5,zmm5,4 + + vpternlogq zmm5,zmm6,zmm7,0x96 + + vmovdqu64 ZMMWORD[384+rsp],zmm5 + + vpclmulqdq zmm6,zmm4,zmm3,0x11 + vpclmulqdq zmm7,zmm4,zmm3,0x00 + vpclmulqdq zmm10,zmm4,zmm3,0x01 + vpclmulqdq zmm4,zmm4,zmm3,0x10 + vpxorq zmm4,zmm4,zmm10 + + vpsrldq zmm10,zmm4,8 + vpslldq zmm4,zmm4,8 + vpxorq zmm6,zmm6,zmm10 + vpxorq zmm4,zmm4,zmm7 + + + + vmovdqu64 zmm10,ZMMWORD[POLY2] + + vpclmulqdq zmm7,zmm10,zmm4,0x01 + vpslldq zmm7,zmm7,8 + vpxorq zmm4,zmm4,zmm7 + + + + vpclmulqdq zmm7,zmm10,zmm4,0x00 + vpsrldq zmm7,zmm7,4 + vpclmulqdq zmm4,zmm10,zmm4,0x10 + vpslldq zmm4,zmm4,4 + + vpternlogq zmm4,zmm6,zmm7,0x96 + + vmovdqu64 ZMMWORD[320+rsp],zmm4 + + vpclmulqdq zmm6,zmm5,zmm3,0x11 + vpclmulqdq zmm7,zmm5,zmm3,0x00 + vpclmulqdq zmm10,zmm5,zmm3,0x01 + vpclmulqdq zmm5,zmm5,zmm3,0x10 + vpxorq zmm5,zmm5,zmm10 + + vpsrldq zmm10,zmm5,8 + vpslldq zmm5,zmm5,8 + vpxorq zmm6,zmm6,zmm10 + vpxorq zmm5,zmm5,zmm7 + + + + vmovdqu64 zmm10,ZMMWORD[POLY2] + + vpclmulqdq zmm7,zmm10,zmm5,0x01 + vpslldq zmm7,zmm7,8 + vpxorq zmm5,zmm5,zmm7 + + + + vpclmulqdq zmm7,zmm10,zmm5,0x00 + vpsrldq zmm7,zmm7,4 + vpclmulqdq zmm5,zmm10,zmm5,0x10 + vpslldq zmm5,zmm5,4 + + vpternlogq zmm5,zmm6,zmm7,0x96 + + vmovdqu64 ZMMWORD[256+rsp],zmm5 + + vpclmulqdq zmm6,zmm4,zmm3,0x11 + vpclmulqdq zmm7,zmm4,zmm3,0x00 + vpclmulqdq zmm10,zmm4,zmm3,0x01 + vpclmulqdq zmm4,zmm4,zmm3,0x10 + vpxorq zmm4,zmm4,zmm10 + + vpsrldq zmm10,zmm4,8 + vpslldq zmm4,zmm4,8 + vpxorq zmm6,zmm6,zmm10 + vpxorq zmm4,zmm4,zmm7 + + + + vmovdqu64 zmm10,ZMMWORD[POLY2] + + vpclmulqdq zmm7,zmm10,zmm4,0x01 + vpslldq zmm7,zmm7,8 + vpxorq zmm4,zmm4,zmm7 + + + + vpclmulqdq zmm7,zmm10,zmm4,0x00 + vpsrldq zmm7,zmm7,4 + vpclmulqdq zmm4,zmm10,zmm4,0x10 + vpslldq zmm4,zmm4,4 + + vpternlogq zmm4,zmm6,zmm7,0x96 + + vmovdqu64 ZMMWORD[192+rsp],zmm4 + + vpclmulqdq 
zmm6,zmm5,zmm3,0x11 + vpclmulqdq zmm7,zmm5,zmm3,0x00 + vpclmulqdq zmm10,zmm5,zmm3,0x01 + vpclmulqdq zmm5,zmm5,zmm3,0x10 + vpxorq zmm5,zmm5,zmm10 + + vpsrldq zmm10,zmm5,8 + vpslldq zmm5,zmm5,8 + vpxorq zmm6,zmm6,zmm10 + vpxorq zmm5,zmm5,zmm7 + + + + vmovdqu64 zmm10,ZMMWORD[POLY2] + + vpclmulqdq zmm7,zmm10,zmm5,0x01 + vpslldq zmm7,zmm7,8 + vpxorq zmm5,zmm5,zmm7 + + + + vpclmulqdq zmm7,zmm10,zmm5,0x00 + vpsrldq zmm7,zmm7,4 + vpclmulqdq zmm5,zmm10,zmm5,0x10 + vpslldq zmm5,zmm5,4 + + vpternlogq zmm5,zmm6,zmm7,0x96 + + vmovdqu64 ZMMWORD[128+rsp],zmm5 + + vpclmulqdq zmm6,zmm4,zmm3,0x11 + vpclmulqdq zmm7,zmm4,zmm3,0x00 + vpclmulqdq zmm10,zmm4,zmm3,0x01 + vpclmulqdq zmm4,zmm4,zmm3,0x10 + vpxorq zmm4,zmm4,zmm10 + + vpsrldq zmm10,zmm4,8 + vpslldq zmm4,zmm4,8 + vpxorq zmm6,zmm6,zmm10 + vpxorq zmm4,zmm4,zmm7 + + + + vmovdqu64 zmm10,ZMMWORD[POLY2] + + vpclmulqdq zmm7,zmm10,zmm4,0x01 + vpslldq zmm7,zmm7,8 + vpxorq zmm4,zmm4,zmm7 + + + + vpclmulqdq zmm7,zmm10,zmm4,0x00 + vpsrldq zmm7,zmm7,4 + vpclmulqdq zmm4,zmm10,zmm4,0x10 + vpslldq zmm4,zmm4,4 + + vpternlogq zmm4,zmm6,zmm7,0x96 + + vmovdqu64 ZMMWORD[64+rsp],zmm4 + + vpclmulqdq zmm6,zmm5,zmm3,0x11 + vpclmulqdq zmm7,zmm5,zmm3,0x00 + vpclmulqdq zmm10,zmm5,zmm3,0x01 + vpclmulqdq zmm5,zmm5,zmm3,0x10 + vpxorq zmm5,zmm5,zmm10 + + vpsrldq zmm10,zmm5,8 + vpslldq zmm5,zmm5,8 + vpxorq zmm6,zmm6,zmm10 + vpxorq zmm5,zmm5,zmm7 + + + + vmovdqu64 zmm10,ZMMWORD[POLY2] + + vpclmulqdq zmm7,zmm10,zmm5,0x01 + vpslldq zmm7,zmm7,8 + vpxorq zmm5,zmm5,zmm7 + + + + vpclmulqdq zmm7,zmm10,zmm5,0x00 + vpsrldq zmm7,zmm7,4 + vpclmulqdq zmm5,zmm10,zmm5,0x10 + vpslldq zmm5,zmm5,4 + + vpternlogq zmm5,zmm6,zmm7,0x96 + + vmovdqu64 ZMMWORD[rsp],zmm5 +$L$_skip_hkeys_precomputation_339: + mov r14,1 + add r11,512 + sub r13,512 + + cmp r13,768 + jb NEAR $L$_no_more_big_nblocks_334 +$L$_encrypt_big_nblocks_334: + cmp r15b,240 + jae NEAR $L$_16_blocks_overflow_340 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + vpaddd zmm5,zmm4,zmm27 + jmp NEAR $L$_16_blocks_ok_340 +$L$_16_blocks_overflow_340: + vpshufb zmm2,zmm2,zmm29 + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpaddd zmm5,zmm4,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb zmm5,zmm5,zmm29 +$L$_16_blocks_ok_340: + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rsp] + + + + + vshufi64x2 zmm2,zmm5,zmm5,255 + add r15b,16 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + + + + + + + + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + + vpclmulqdq zmm6,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + + + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + + + + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + 
vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + + vpternlogq zmm6,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + + + + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + + + + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20,ZMMWORD[128+r11*1+r9] + vmovdqu8 zmm21,ZMMWORD[192+r11*1+r9] + + + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + + + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm26,zmm10,zmm15 + vpxorq zmm24,zmm6,zmm12 + vpxorq zmm25,zmm7,zmm13 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vaesenclast zmm5,zmm5,zmm30 + + + + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vpxorq zmm5,zmm5,zmm21 + + + + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm4 + vmovdqu8 ZMMWORD[192+r11*1+r10],zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb zmm5,zmm5,zmm29 + vmovdqa64 ZMMWORD[1280+rsp],zmm0 + vmovdqa64 ZMMWORD[1344+rsp],zmm3 + vmovdqa64 ZMMWORD[1408+rsp],zmm4 + vmovdqa64 ZMMWORD[1472+rsp],zmm5 + cmp r15b,240 + jae NEAR $L$_16_blocks_overflow_341 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + vpaddd zmm5,zmm4,zmm27 + jmp NEAR $L$_16_blocks_ok_341 +$L$_16_blocks_overflow_341: + vpshufb zmm2,zmm2,zmm29 + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpaddd zmm5,zmm4,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb zmm5,zmm5,zmm29 +$L$_16_blocks_ok_341: + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1024+rsp] + vmovdqu64 zmm1,ZMMWORD[256+rsp] + + + + + vshufi64x2 
zmm2,zmm5,zmm5,255 + add r15b,16 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[320+rsp] + vmovdqa64 zmm22,ZMMWORD[1088+rsp] + + + + + + + + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + + vpclmulqdq zmm6,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[384+rsp] + vmovdqa64 zmm8,ZMMWORD[1152+rsp] + + + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[448+rsp] + vmovdqa64 zmm22,ZMMWORD[1216+rsp] + + + + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + + vpternlogq zmm6,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + + + + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + + + + vmovdqu8 zmm17,ZMMWORD[256+r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[320+r11*1+r9] + vmovdqu8 zmm20,ZMMWORD[384+r11*1+r9] + vmovdqu8 zmm21,ZMMWORD[448+r11*1+r9] + + + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + + + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm26,zmm10,zmm15,0x96 + vpternlogq zmm24,zmm6,zmm12,0x96 + vpternlogq zmm25,zmm7,zmm13,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vaesenclast zmm5,zmm5,zmm30 + + + + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 
+ vpxorq zmm5,zmm5,zmm21 + + + + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[256+r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[320+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[384+r11*1+r10],zmm4 + vmovdqu8 ZMMWORD[448+r11*1+r10],zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb zmm5,zmm5,zmm29 + vmovdqa64 ZMMWORD[768+rsp],zmm0 + vmovdqa64 ZMMWORD[832+rsp],zmm3 + vmovdqa64 ZMMWORD[896+rsp],zmm4 + vmovdqa64 ZMMWORD[960+rsp],zmm5 + cmp r15b,240 + jae NEAR $L$_16_blocks_overflow_342 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + vpaddd zmm5,zmm4,zmm27 + jmp NEAR $L$_16_blocks_ok_342 +$L$_16_blocks_overflow_342: + vpshufb zmm2,zmm2,zmm29 + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpaddd zmm5,zmm4,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb zmm5,zmm5,zmm29 +$L$_16_blocks_ok_342: + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1280+rsp] + vmovdqu64 zmm1,ZMMWORD[512+rsp] + + + + + vshufi64x2 zmm2,zmm5,zmm5,255 + add r15b,16 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[576+rsp] + vmovdqa64 zmm22,ZMMWORD[1344+rsp] + + + + + + + + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + + vpclmulqdq zmm6,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[640+rsp] + vmovdqa64 zmm8,ZMMWORD[1408+rsp] + + + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[704+rsp] + vmovdqa64 zmm22,ZMMWORD[1472+rsp] + + + + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + + vpternlogq zmm6,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + + + + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + + + + vmovdqu8 zmm17,ZMMWORD[512+r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[576+r11*1+r9] + vmovdqu8 zmm20,ZMMWORD[640+r11*1+r9] + vmovdqu8 zmm21,ZMMWORD[704+r11*1+r9] + + + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + + + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + + + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm10,zmm26,zmm15,0x96 + + vpsrldq zmm15,zmm10,8 + vpslldq zmm10,zmm10,8 + + vmovdqa64 xmm16,XMMWORD[POLY2] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc 
zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vpternlogq zmm6,zmm12,zmm15,0x96 + vpxorq zmm6,zmm6,zmm24 + vpternlogq zmm7,zmm13,zmm10,0x96 + vpxorq zmm7,zmm7,zmm25 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vextracti64x4 ymm12,zmm6,1 + vpxorq ymm6,ymm6,ymm12 + vextracti32x4 xmm12,ymm6,1 + vpxorq xmm6,xmm6,xmm12 + vextracti64x4 ymm13,zmm7,1 + vpxorq ymm7,ymm7,ymm13 + vextracti32x4 xmm13,ymm7,1 + vpxorq xmm7,xmm7,xmm13 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vpclmulqdq xmm13,xmm16,xmm7,0x01 + vpslldq xmm13,xmm13,8 + vpxorq xmm13,xmm7,xmm13 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vpclmulqdq xmm12,xmm16,xmm13,0x00 + vpsrldq xmm12,xmm12,4 + vpclmulqdq xmm15,xmm16,xmm13,0x10 + vpslldq xmm15,xmm15,4 + + vpternlogq xmm6,xmm15,xmm12,0x96 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vaesenclast zmm5,zmm5,zmm30 + + + + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vpxorq zmm5,zmm5,zmm21 + + + + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[512+r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[576+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[640+r11*1+r10],zmm4 + vmovdqu8 ZMMWORD[704+r11*1+r10],zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb zmm5,zmm5,zmm29 + vmovdqa64 ZMMWORD[1024+rsp],zmm0 + vmovdqa64 ZMMWORD[1088+rsp],zmm3 + vmovdqa64 ZMMWORD[1152+rsp],zmm4 + vmovdqa64 ZMMWORD[1216+rsp],zmm5 + vmovdqa64 zmm14,zmm6 + + add r11,768 + sub r13,768 + cmp r13,768 + jae NEAR $L$_encrypt_big_nblocks_334 + +$L$_no_more_big_nblocks_334: + + cmp r13,512 + jae NEAR $L$_encrypt_32_blocks_334 + + cmp r13,256 + jae NEAR $L$_encrypt_16_blocks_334 +$L$_encrypt_0_blocks_ghash_32_334: + mov r10d,r13d + and r10d,~15 + mov ebx,256 + sub ebx,r10d + vmovdqa64 zmm13,ZMMWORD[768+rsp] + vpxorq zmm13,zmm13,zmm14 + vmovdqu64 zmm12,ZMMWORD[rbx*1+rsp] + vpclmulqdq zmm0,zmm13,zmm12,0x11 + vpclmulqdq zmm3,zmm13,zmm12,0x00 + vpclmulqdq zmm4,zmm13,zmm12,0x01 + vpclmulqdq zmm5,zmm13,zmm12,0x10 + vmovdqa64 zmm13,ZMMWORD[832+rsp] + vmovdqu64 zmm12,ZMMWORD[64+rbx*1+rsp] + vpclmulqdq zmm6,zmm13,zmm12,0x11 + vpclmulqdq zmm7,zmm13,zmm12,0x00 + vpclmulqdq zmm10,zmm13,zmm12,0x01 + vpclmulqdq zmm11,zmm13,zmm12,0x10 + vpxorq zmm26,zmm4,zmm10 + vpxorq zmm24,zmm0,zmm6 + vpxorq zmm25,zmm3,zmm7 + vpternlogq zmm26,zmm5,zmm11,0x96 + vmovdqa64 zmm13,ZMMWORD[896+rsp] + vmovdqu64 zmm12,ZMMWORD[128+rbx*1+rsp] + vpclmulqdq zmm0,zmm13,zmm12,0x11 + vpclmulqdq zmm3,zmm13,zmm12,0x00 + vpclmulqdq zmm4,zmm13,zmm12,0x01 + vpclmulqdq zmm5,zmm13,zmm12,0x10 + vmovdqa64 zmm13,ZMMWORD[960+rsp] + vmovdqu64 zmm12,ZMMWORD[192+rbx*1+rsp] + vpclmulqdq zmm6,zmm13,zmm12,0x11 + vpclmulqdq zmm7,zmm13,zmm12,0x00 + vpclmulqdq zmm10,zmm13,zmm12,0x01 + vpclmulqdq zmm11,zmm13,zmm12,0x10 + + vpternlogq zmm26,zmm4,zmm10,0x96 + vpternlogq 
zmm24,zmm0,zmm6,0x96 + vpternlogq zmm25,zmm3,zmm7,0x96 + vpternlogq zmm26,zmm5,zmm11,0x96 + add ebx,256 + mov r10d,r13d + add r10d,15 + shr r10d,4 + je NEAR $L$_last_num_blocks_is_0_343 + + cmp r10d,8 + je NEAR $L$_last_num_blocks_is_8_343 + jb NEAR $L$_last_num_blocks_is_7_1_343 + + + cmp r10d,12 + je NEAR $L$_last_num_blocks_is_12_343 + jb NEAR $L$_last_num_blocks_is_11_9_343 + + + cmp r10d,15 + je NEAR $L$_last_num_blocks_is_15_343 + ja NEAR $L$_last_num_blocks_is_16_343 + cmp r10d,14 + je NEAR $L$_last_num_blocks_is_14_343 + jmp NEAR $L$_last_num_blocks_is_13_343 + +$L$_last_num_blocks_is_11_9_343: + + cmp r10d,10 + je NEAR $L$_last_num_blocks_is_10_343 + ja NEAR $L$_last_num_blocks_is_11_343 + jmp NEAR $L$_last_num_blocks_is_9_343 + +$L$_last_num_blocks_is_7_1_343: + cmp r10d,4 + je NEAR $L$_last_num_blocks_is_4_343 + jb NEAR $L$_last_num_blocks_is_3_1_343 + + cmp r10d,6 + ja NEAR $L$_last_num_blocks_is_7_343 + je NEAR $L$_last_num_blocks_is_6_343 + jmp NEAR $L$_last_num_blocks_is_5_343 + +$L$_last_num_blocks_is_3_1_343: + + cmp r10d,2 + ja NEAR $L$_last_num_blocks_is_3_343 + je NEAR $L$_last_num_blocks_is_2_343 +$L$_last_num_blocks_is_1_343: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + kmovq k1,[rax*8+r10] + cmp r15d,255 + jae NEAR $L$_16_blocks_overflow_344 + vpaddd xmm0,xmm2,xmm28 + jmp NEAR $L$_16_blocks_ok_344 + +$L$_16_blocks_overflow_344: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpshufb xmm0,xmm0,xmm29 +$L$_16_blocks_ok_344: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1024+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm0,0 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1088+rsp] + vpxorq xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[1152+rsp] + vaesenc xmm0,xmm0,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1216+rsp] + vaesenc xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc xmm0,xmm0,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 xmm17{k1}{z},[r11*1+r9] + vaesenc xmm0,xmm0,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm24,zmm14,zmm12,0x96 + vpternlogq zmm25,zmm7,zmm13,0x96 + vpternlogq zmm26,zmm10,zmm15,0x96 + vaesenc xmm0,xmm0,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc xmm0,xmm0,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc xmm0,xmm0,xmm30 + vbroadcastf64x2 
zmm30,ZMMWORD[192+rcx] + vaesenc xmm0,xmm0,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + vaesenc xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + vaesenc xmm0,xmm0,xmm31 + vaesenclast xmm0,xmm0,xmm30 + vpxorq xmm0,xmm0,xmm17 + vextracti32x4 xmm11,zmm0,0 + mov r10,QWORD[120+rbp] + vmovdqu8 XMMWORD[r11*1+r10]{k1},xmm0 + vmovdqu8 zmm0{k1}{z},zmm0 + vpshufb xmm17,xmm0,xmm29 + vextracti32x4 xmm7,zmm17,0 + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_345 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm17,xmm1,0x01 + vpclmulqdq xmm5,xmm17,xmm1,0x10 + vpclmulqdq xmm0,xmm17,xmm1,0x11 + vpclmulqdq xmm3,xmm17,xmm1,0x00 + vpxorq zmm4,zmm4,zmm26 + vpxorq zmm0,zmm0,zmm24 + vpxorq zmm3,zmm3,zmm25 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_345 +$L$_small_initial_partial_block_345: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + + + vpsrldq zmm0,zmm26,8 + vpslldq zmm3,zmm26,8 + vpxorq zmm24,zmm24,zmm0 + vpxorq zmm25,zmm25,zmm3 + vextracti64x4 ymm0,zmm24,1 + vpxorq ymm24,ymm24,ymm0 + vextracti32x4 xmm0,ymm24,1 + vpxorq xmm24,xmm24,xmm0 + vextracti64x4 ymm3,zmm25,1 + vpxorq ymm25,ymm25,ymm3 + vextracti32x4 xmm3,ymm25,1 + vpxorq xmm25,xmm25,xmm3 + vmovdqa64 xmm0,XMMWORD[POLY2] + + + vpclmulqdq xmm3,xmm0,xmm25,0x01 + vpslldq xmm3,xmm3,8 + vpxorq xmm3,xmm25,xmm3 + + + vpclmulqdq xmm4,xmm0,xmm3,0x00 + vpsrldq xmm4,xmm4,4 + vpclmulqdq xmm14,xmm0,xmm3,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm4,xmm24,0x96 + + + + + + + + + + + + + vpxorq xmm14,xmm14,xmm7 + + jmp NEAR $L$_after_reduction_345 +$L$_small_initial_compute_done_345: +$L$_after_reduction_345: + jmp NEAR $L$_last_blocks_done_343 +$L$_last_num_blocks_is_2_343: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + kmovq k1,[rax*8+r10] + cmp r15d,254 + jae NEAR $L$_16_blocks_overflow_346 + vpaddd ymm0,ymm2,ymm28 + jmp NEAR $L$_16_blocks_ok_346 + +$L$_16_blocks_overflow_346: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpshufb ymm0,ymm0,ymm29 +$L$_16_blocks_ok_346: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1024+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm0,1 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1088+rsp] + vpxorq ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[1152+rsp] + vaesenc ymm0,ymm0,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1216+rsp] + vaesenc ymm0,ymm0,ymm30 + 
vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc ymm0,ymm0,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 ymm17{k1}{z},[r11*1+r9] + vaesenc ymm0,ymm0,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm24,zmm14,zmm12,0x96 + vpternlogq zmm25,zmm7,zmm13,0x96 + vpternlogq zmm26,zmm10,zmm15,0x96 + vaesenc ymm0,ymm0,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc ymm0,ymm0,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc ymm0,ymm0,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + vaesenc ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + vaesenc ymm0,ymm0,ymm31 + vaesenclast ymm0,ymm0,ymm30 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm11,zmm0,1 + mov r10,QWORD[120+rbp] + vmovdqu8 YMMWORD[r11*1+r10]{k1},ymm0 + vmovdqu8 zmm0{k1}{z},zmm0 + vpshufb ymm17,ymm0,ymm29 + vextracti32x4 xmm7,zmm17,1 + sub r13,16 * (2 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_347 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm17,ymm1,0x01 + vpclmulqdq ymm5,ymm17,ymm1,0x10 + vpclmulqdq ymm0,ymm17,ymm1,0x11 + vpclmulqdq ymm3,ymm17,ymm1,0x00 + vpxorq zmm4,zmm4,zmm26 + vpxorq zmm0,zmm0,zmm24 + vpxorq zmm3,zmm3,zmm25 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_347 +$L$_small_initial_partial_block_347: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm17,xmm1,0x01 + vpclmulqdq xmm5,xmm17,xmm1,0x10 + vpclmulqdq xmm0,xmm17,xmm1,0x11 + vpclmulqdq xmm3,xmm17,xmm1,0x00 + vpxorq zmm4,zmm4,zmm26 + vpxorq zmm0,zmm0,zmm24 + vpxorq zmm3,zmm3,zmm25 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq 
xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_347: + + or r13,r13 + je NEAR $L$_after_reduction_347 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_347: + jmp NEAR $L$_last_blocks_done_343 +$L$_last_num_blocks_is_3_343: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + kmovq k1,[rax*8+r10] + cmp r15d,253 + jae NEAR $L$_16_blocks_overflow_348 + vpaddd zmm0,zmm2,zmm28 + jmp NEAR $L$_16_blocks_ok_348 + +$L$_16_blocks_overflow_348: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpshufb zmm0,zmm0,zmm29 +$L$_16_blocks_ok_348: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1024+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm0,2 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1088+rsp] + vpxorq zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[1152+rsp] + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1216+rsp] + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17{k1}{z},[r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm24,zmm14,zmm12,0x96 + vpternlogq zmm25,zmm7,zmm13,0x96 + vpternlogq zmm26,zmm10,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vpxorq zmm0,zmm0,zmm17 + vextracti32x4 xmm11,zmm0,2 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10]{k1},zmm0 + vmovdqu8 zmm0{k1}{z},zmm0 + vpshufb zmm17,zmm0,zmm29 + vextracti32x4 xmm7,zmm17,2 + sub r13,16 * (3 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_349 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpxorq zmm4,zmm4,zmm26 + vpxorq zmm0,zmm0,zmm24 + vpxorq zmm3,zmm3,zmm25 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq 
zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_349 +$L$_small_initial_partial_block_349: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm17,ymm1,0x01 + vpclmulqdq ymm5,ymm17,ymm1,0x10 + vpclmulqdq ymm0,ymm17,ymm1,0x11 + vpclmulqdq ymm3,ymm17,ymm1,0x00 + vpxorq zmm4,zmm4,zmm26 + vpxorq zmm0,zmm0,zmm24 + vpxorq zmm3,zmm3,zmm25 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_349: + + or r13,r13 + je NEAR $L$_after_reduction_349 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_349: + jmp NEAR $L$_last_blocks_done_343 +$L$_last_num_blocks_is_4_343: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + kmovq k1,[rax*8+r10] + cmp r15d,252 + jae NEAR $L$_16_blocks_overflow_350 + vpaddd zmm0,zmm2,zmm28 + jmp NEAR $L$_16_blocks_ok_350 + +$L$_16_blocks_overflow_350: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpshufb zmm0,zmm0,zmm29 +$L$_16_blocks_ok_350: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1024+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm0,3 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1088+rsp] + vpxorq zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[1152+rsp] + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1216+rsp] + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17{k1}{z},[r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq 
zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm24,zmm14,zmm12,0x96 + vpternlogq zmm25,zmm7,zmm13,0x96 + vpternlogq zmm26,zmm10,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vpxorq zmm0,zmm0,zmm17 + vextracti32x4 xmm11,zmm0,3 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10]{k1},zmm0 + vmovdqu8 zmm0{k1}{z},zmm0 + vpshufb zmm17,zmm0,zmm29 + vextracti32x4 xmm7,zmm17,3 + sub r13,16 * (4 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_351 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + + vpxorq zmm30,zmm30,zmm26 + vpxorq zmm8,zmm8,zmm24 + vpxorq zmm22,zmm22,zmm25 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_351 +$L$_small_initial_partial_block_351: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpxorq zmm4,zmm4,zmm26 + vpxorq zmm0,zmm0,zmm24 + vpxorq zmm3,zmm3,zmm25 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_351: + + or r13,r13 + je NEAR $L$_after_reduction_351 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_351: + jmp NEAR $L$_last_blocks_done_343 +$L$_last_num_blocks_is_5_343: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,64 + kmovq k1,[rax*8+r10] + cmp r15d,251 + jae NEAR $L$_16_blocks_overflow_352 + vpaddd zmm0,zmm2,zmm28 + vpaddd xmm3,xmm0,xmm27 + jmp NEAR $L$_16_blocks_ok_352 + +$L$_16_blocks_overflow_352: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 
zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb xmm3,xmm3,xmm29 +$L$_16_blocks_ok_352: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1024+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm3,0 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1088+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[1152+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1216+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 xmm19{k1}{z},[64+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm24,zmm14,zmm12,0x96 + vpternlogq zmm25,zmm7,zmm13,0x96 + vpternlogq zmm26,zmm10,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast xmm3,xmm3,xmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq xmm3,xmm3,xmm19 + vextracti32x4 xmm11,zmm3,0 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 XMMWORD[64+r11*1+r10]{k1},xmm3 + vmovdqu8 zmm3{k1}{z},zmm3 + vpshufb zmm17,zmm0,zmm29 + vpshufb xmm19,xmm3,xmm29 + vextracti32x4 xmm7,zmm19,0 + sub r13,16 * (5 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_353 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm19,xmm1,0x01 + vpclmulqdq xmm5,xmm19,xmm1,0x10 + vpclmulqdq xmm0,xmm19,xmm1,0x11 + 
vpclmulqdq xmm3,xmm19,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_353 +$L$_small_initial_partial_block_353: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + + vpxorq zmm30,zmm30,zmm26 + vpxorq zmm8,zmm8,zmm24 + vpxorq zmm22,zmm22,zmm25 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_353: + + or r13,r13 + je NEAR $L$_after_reduction_353 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_353: + jmp NEAR $L$_last_blocks_done_343 +$L$_last_num_blocks_is_6_343: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,64 + kmovq k1,[rax*8+r10] + cmp r15d,250 + jae NEAR $L$_16_blocks_overflow_354 + vpaddd zmm0,zmm2,zmm28 + vpaddd ymm3,ymm0,ymm27 + jmp NEAR $L$_16_blocks_ok_354 + +$L$_16_blocks_overflow_354: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb ymm3,ymm3,ymm29 +$L$_16_blocks_ok_354: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1024+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm3,1 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1088+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[1152+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1216+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq 
zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 ymm19{k1}{z},[64+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm24,zmm14,zmm12,0x96 + vpternlogq zmm25,zmm7,zmm13,0x96 + vpternlogq zmm26,zmm10,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast ymm3,ymm3,ymm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm11,zmm3,1 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 YMMWORD[64+r11*1+r10]{k1},ymm3 + vmovdqu8 zmm3{k1}{z},zmm3 + vpshufb zmm17,zmm0,zmm29 + vpshufb ymm19,ymm3,ymm29 + vextracti32x4 xmm7,zmm19,1 + sub r13,16 * (6 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_355 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm19,ymm1,0x01 + vpclmulqdq ymm5,ymm19,ymm1,0x10 + vpclmulqdq ymm0,ymm19,ymm1,0x11 + vpclmulqdq ymm3,ymm19,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_355 +$L$_small_initial_partial_block_355: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm19,xmm1,0x01 + vpclmulqdq xmm5,xmm19,xmm1,0x10 + vpclmulqdq 
xmm0,xmm19,xmm1,0x11 + vpclmulqdq xmm3,xmm19,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_355: + + or r13,r13 + je NEAR $L$_after_reduction_355 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_355: + jmp NEAR $L$_last_blocks_done_343 +$L$_last_num_blocks_is_7_343: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,64 + kmovq k1,[rax*8+r10] + cmp r15d,249 + jae NEAR $L$_16_blocks_overflow_356 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + jmp NEAR $L$_16_blocks_ok_356 + +$L$_16_blocks_overflow_356: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 +$L$_16_blocks_ok_356: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1024+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm3,2 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1088+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[1152+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1216+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19{k1}{z},[64+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm24,zmm14,zmm12,0x96 + vpternlogq zmm25,zmm7,zmm13,0x96 + vpternlogq zmm26,zmm10,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 
zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti32x4 xmm11,zmm3,2 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10]{k1},zmm3 + vmovdqu8 zmm3{k1}{z},zmm3 + vpshufb zmm17,zmm0,zmm29 + vpshufb zmm19,zmm3,zmm29 + vextracti32x4 xmm7,zmm19,2 + sub r13,16 * (7 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_357 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm19,zmm1,0x01 + vpclmulqdq zmm5,zmm19,zmm1,0x10 + vpclmulqdq zmm0,zmm19,zmm1,0x11 + vpclmulqdq zmm3,zmm19,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_357 +$L$_small_initial_partial_block_357: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm19,ymm1,0x01 + vpclmulqdq ymm5,ymm19,ymm1,0x10 + vpclmulqdq ymm0,ymm19,ymm1,0x11 + vpclmulqdq ymm3,ymm19,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_357: + + or r13,r13 + je NEAR $L$_after_reduction_357 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_357: + jmp NEAR $L$_last_blocks_done_343 
+$L$_last_num_blocks_is_8_343: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,64 + kmovq k1,[rax*8+r10] + cmp r15d,248 + jae NEAR $L$_16_blocks_overflow_358 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + jmp NEAR $L$_16_blocks_ok_358 + +$L$_16_blocks_overflow_358: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 +$L$_16_blocks_ok_358: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1024+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm3,3 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1088+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[1152+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1216+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19{k1}{z},[64+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm24,zmm14,zmm12,0x96 + vpternlogq zmm25,zmm7,zmm13,0x96 + vpternlogq zmm26,zmm10,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti32x4 xmm11,zmm3,3 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10]{k1},zmm3 + vmovdqu8 zmm3{k1}{z},zmm3 + vpshufb zmm17,zmm0,zmm29 + vpshufb zmm19,zmm3,zmm29 + vextracti32x4 xmm7,zmm19,3 + sub r13,16 * (8 - 1) + + + cmp r13,16 + jl NEAR 
$L$_small_initial_partial_block_359 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[224+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + + vpxorq zmm30,zmm30,zmm26 + vpxorq zmm8,zmm8,zmm24 + vpxorq zmm22,zmm22,zmm25 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_359 +$L$_small_initial_partial_block_359: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm19,zmm1,0x01 + vpclmulqdq zmm5,zmm19,zmm1,0x10 + vpclmulqdq zmm0,zmm19,zmm1,0x11 + vpclmulqdq zmm3,zmm19,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_359: + + or r13,r13 + je NEAR $L$_after_reduction_359 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_359: + jmp NEAR $L$_last_blocks_done_343 +$L$_last_num_blocks_is_9_343: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,128 + kmovq k1,[rax*8+r10] + cmp r15d,247 + jae NEAR $L$_16_blocks_overflow_360 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd xmm4,xmm3,xmm27 + jmp NEAR $L$_16_blocks_ok_360 + +$L$_16_blocks_overflow_360: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb xmm4,xmm4,xmm29 +$L$_16_blocks_ok_360: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1024+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm4,0 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1088+rsp] + vpxorq zmm0,zmm0,zmm30 + 
vpxorq zmm3,zmm3,zmm30 + vpxorq xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[1152+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1216+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 xmm20{k1}{z},[128+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm24,zmm14,zmm12,0x96 + vpternlogq zmm25,zmm7,zmm13,0x96 + vpternlogq zmm26,zmm10,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast xmm4,xmm4,xmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq xmm4,xmm4,xmm20 + vextracti32x4 xmm11,zmm4,0 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 XMMWORD[128+r11*1+r10]{k1},xmm4 + vmovdqu8 zmm4{k1}{z},zmm4 + vpshufb zmm17,zmm0,zmm29 + vpshufb zmm19,zmm3,zmm29 + vpshufb xmm20,xmm4,xmm29 + vextracti32x4 xmm7,zmm20,0 + sub r13,16 * (9 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_361 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[208+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq 
zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm20,xmm1,0x01 + vpclmulqdq xmm5,xmm20,xmm1,0x10 + vpclmulqdq xmm0,xmm20,xmm1,0x11 + vpclmulqdq xmm3,xmm20,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_361 +$L$_small_initial_partial_block_361: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[224+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + + vpxorq zmm30,zmm30,zmm26 + vpxorq zmm8,zmm8,zmm24 + vpxorq zmm22,zmm22,zmm25 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_361: + + or r13,r13 + je NEAR $L$_after_reduction_361 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_361: + jmp NEAR $L$_last_blocks_done_343 +$L$_last_num_blocks_is_10_343: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,128 + kmovq k1,[rax*8+r10] + cmp r15d,246 + jae NEAR $L$_16_blocks_overflow_362 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd ymm4,ymm3,ymm27 + jmp NEAR $L$_16_blocks_ok_362 + +$L$_16_blocks_overflow_362: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb ymm4,ymm4,ymm29 +$L$_16_blocks_ok_362: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1024+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm4,1 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1088+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq ymm4,ymm4,ymm30 + vbroadcastf64x2 
zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[1152+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1216+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 ymm20{k1}{z},[128+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm24,zmm14,zmm12,0x96 + vpternlogq zmm25,zmm7,zmm13,0x96 + vpternlogq zmm26,zmm10,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast ymm4,ymm4,ymm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq ymm4,ymm4,ymm20 + vextracti32x4 xmm11,zmm4,1 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 YMMWORD[128+r11*1+r10]{k1},ymm4 + vmovdqu8 zmm4{k1}{z},zmm4 + vpshufb zmm17,zmm0,zmm29 + vpshufb zmm19,zmm3,zmm29 + vpshufb ymm20,ymm4,ymm29 + vextracti32x4 xmm7,zmm20,1 + sub r13,16 * (10 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_363 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[192+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq 
zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm20,ymm1,0x01 + vpclmulqdq ymm5,ymm20,ymm1,0x10 + vpclmulqdq ymm0,ymm20,ymm1,0x11 + vpclmulqdq ymm3,ymm20,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_363 +$L$_small_initial_partial_block_363: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[208+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm20,xmm1,0x01 + vpclmulqdq xmm5,xmm20,xmm1,0x10 + vpclmulqdq xmm0,xmm20,xmm1,0x11 + vpclmulqdq xmm3,xmm20,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_363: + + or r13,r13 + je NEAR $L$_after_reduction_363 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_363: + jmp NEAR $L$_last_blocks_done_343 +$L$_last_num_blocks_is_11_343: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,128 + kmovq k1,[rax*8+r10] + cmp r15d,245 + jae NEAR $L$_16_blocks_overflow_364 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + jmp NEAR $L$_16_blocks_ok_364 + +$L$_16_blocks_overflow_364: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 +$L$_16_blocks_ok_364: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1024+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm4,2 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 
zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1088+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[1152+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1216+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20{k1}{z},[128+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm24,zmm14,zmm12,0x96 + vpternlogq zmm25,zmm7,zmm13,0x96 + vpternlogq zmm26,zmm10,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vextracti32x4 xmm11,zmm4,2 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10]{k1},zmm4 + vmovdqu8 zmm4{k1}{z},zmm4 + vpshufb zmm17,zmm0,zmm29 + vpshufb zmm19,zmm3,zmm29 + vpshufb zmm20,zmm4,zmm29 + vextracti32x4 xmm7,zmm20,2 + sub r13,16 * (11 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_365 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[176+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + 
vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm20,zmm1,0x01 + vpclmulqdq zmm5,zmm20,zmm1,0x10 + vpclmulqdq zmm0,zmm20,zmm1,0x11 + vpclmulqdq zmm3,zmm20,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_365 +$L$_small_initial_partial_block_365: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[192+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm20,ymm1,0x01 + vpclmulqdq ymm5,ymm20,ymm1,0x10 + vpclmulqdq ymm0,ymm20,ymm1,0x11 + vpclmulqdq ymm3,ymm20,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_365: + + or r13,r13 + je NEAR $L$_after_reduction_365 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_365: + jmp NEAR $L$_last_blocks_done_343 +$L$_last_num_blocks_is_12_343: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,128 + kmovq k1,[rax*8+r10] + cmp r15d,244 + jae NEAR $L$_16_blocks_overflow_366 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + jmp NEAR $L$_16_blocks_ok_366 + +$L$_16_blocks_overflow_366: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 +$L$_16_blocks_ok_366: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + 
vmovdqa64 zmm8,ZMMWORD[1024+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm4,3 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1088+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[1152+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1216+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20{k1}{z},[128+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm24,zmm14,zmm12,0x96 + vpternlogq zmm25,zmm7,zmm13,0x96 + vpternlogq zmm26,zmm10,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vextracti32x4 xmm11,zmm4,3 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10]{k1},zmm4 + vmovdqu8 zmm4{k1}{z},zmm4 + vpshufb zmm17,zmm0,zmm29 + vpshufb zmm19,zmm3,zmm29 + vpshufb zmm20,zmm4,zmm29 + vextracti32x4 xmm7,zmm20,3 + sub r13,16 * (12 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_367 + 
+ + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[160+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[224+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + + vpxorq zmm30,zmm30,zmm26 + vpxorq zmm8,zmm8,zmm24 + vpxorq zmm22,zmm22,zmm25 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_367 +$L$_small_initial_partial_block_367: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[176+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm20,zmm1,0x01 + vpclmulqdq zmm5,zmm20,zmm1,0x10 + vpclmulqdq zmm0,zmm20,zmm1,0x11 + vpclmulqdq zmm3,zmm20,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_367: + + or r13,r13 + je NEAR $L$_after_reduction_367 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_367: + jmp NEAR $L$_last_blocks_done_343 +$L$_last_num_blocks_is_13_343: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,192 + kmovq k1,[rax*8+r10] + cmp r15d,243 + jae NEAR $L$_16_blocks_overflow_368 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + vpaddd xmm5,xmm4,xmm27 + jmp NEAR $L$_16_blocks_ok_368 + +$L$_16_blocks_overflow_368: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + 
vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpaddd zmm5,zmm4,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb xmm5,xmm5,xmm29 +$L$_16_blocks_ok_368: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1024+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm5,0 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1088+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vpxorq xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[1152+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1216+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20,ZMMWORD[128+r11*1+r9] + vmovdqu8 xmm21{k1}{z},[192+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm24,zmm14,zmm12,0x96 + vpternlogq zmm25,zmm7,zmm13,0x96 + vpternlogq zmm26,zmm10,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + vaesenc 
zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vaesenclast xmm5,xmm5,xmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vpxorq xmm5,xmm5,xmm21 + vextracti32x4 xmm11,zmm5,0 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm4 + vmovdqu8 XMMWORD[192+r11*1+r10]{k1},xmm5 + vmovdqu8 zmm5{k1}{z},zmm5 + vpshufb zmm17,zmm0,zmm29 + vpshufb zmm19,zmm3,zmm29 + vpshufb zmm20,zmm4,zmm29 + vpshufb xmm21,xmm5,xmm29 + vextracti32x4 xmm7,zmm21,0 + sub r13,16 * (13 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_369 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[144+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[208+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm21,xmm1,0x01 + vpclmulqdq xmm5,xmm21,xmm1,0x10 + vpclmulqdq xmm0,xmm21,xmm1,0x11 + vpclmulqdq xmm3,xmm21,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_369 +$L$_small_initial_partial_block_369: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[160+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[224+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + + vpxorq zmm30,zmm30,zmm26 + vpxorq zmm8,zmm8,zmm24 + vpxorq zmm22,zmm22,zmm25 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq 
ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_369: + + or r13,r13 + je NEAR $L$_after_reduction_369 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_369: + jmp NEAR $L$_last_blocks_done_343 +$L$_last_num_blocks_is_14_343: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,192 + kmovq k1,[rax*8+r10] + cmp r15d,242 + jae NEAR $L$_16_blocks_overflow_370 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + vpaddd ymm5,ymm4,ymm27 + jmp NEAR $L$_16_blocks_ok_370 + +$L$_16_blocks_overflow_370: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpaddd zmm5,zmm4,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb ymm5,ymm5,ymm29 +$L$_16_blocks_ok_370: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1024+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm5,1 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1088+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vpxorq ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[1152+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1216+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20,ZMMWORD[128+r11*1+r9] + vmovdqu8 ymm21{k1}{z},[192+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm24,zmm14,zmm12,0x96 + 
vpternlogq zmm25,zmm7,zmm13,0x96 + vpternlogq zmm26,zmm10,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vaesenclast ymm5,ymm5,ymm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vpxorq ymm5,ymm5,ymm21 + vextracti32x4 xmm11,zmm5,1 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm4 + vmovdqu8 YMMWORD[192+r11*1+r10]{k1},ymm5 + vmovdqu8 zmm5{k1}{z},zmm5 + vpshufb zmm17,zmm0,zmm29 + vpshufb zmm19,zmm3,zmm29 + vpshufb zmm20,zmm4,zmm29 + vpshufb ymm21,ymm5,ymm29 + vextracti32x4 xmm7,zmm21,1 + sub r13,16 * (14 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_371 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[128+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[192+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm21,ymm1,0x01 + vpclmulqdq ymm5,ymm21,ymm1,0x10 + vpclmulqdq ymm0,ymm21,ymm1,0x11 + vpclmulqdq ymm3,ymm21,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_371 +$L$_small_initial_partial_block_371: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[144+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + 
vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[208+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm21,xmm1,0x01 + vpclmulqdq xmm5,xmm21,xmm1,0x10 + vpclmulqdq xmm0,xmm21,xmm1,0x11 + vpclmulqdq xmm3,xmm21,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_371: + + or r13,r13 + je NEAR $L$_after_reduction_371 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_371: + jmp NEAR $L$_last_blocks_done_343 +$L$_last_num_blocks_is_15_343: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,192 + kmovq k1,[rax*8+r10] + cmp r15d,241 + jae NEAR $L$_16_blocks_overflow_372 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + vpaddd zmm5,zmm4,zmm27 + jmp NEAR $L$_16_blocks_ok_372 + +$L$_16_blocks_overflow_372: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpaddd zmm5,zmm4,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb zmm5,zmm5,zmm29 +$L$_16_blocks_ok_372: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1024+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm5,2 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1088+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[1152+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1216+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq 
zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20,ZMMWORD[128+r11*1+r9] + vmovdqu8 zmm21{k1}{z},[192+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm24,zmm14,zmm12,0x96 + vpternlogq zmm25,zmm7,zmm13,0x96 + vpternlogq zmm26,zmm10,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vaesenclast zmm5,zmm5,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vpxorq zmm5,zmm5,zmm21 + vextracti32x4 xmm11,zmm5,2 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm4 + vmovdqu8 ZMMWORD[192+r11*1+r10]{k1},zmm5 + vmovdqu8 zmm5{k1}{z},zmm5 + vpshufb zmm17,zmm0,zmm29 + vpshufb zmm19,zmm3,zmm29 + vpshufb zmm20,zmm4,zmm29 + vpshufb zmm21,zmm5,zmm29 + vextracti32x4 xmm7,zmm21,2 + sub r13,16 * (15 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_373 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[112+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[176+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq 
zmm31,zmm19,zmm5,0x96 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm21,zmm1,0x01 + vpclmulqdq zmm5,zmm21,zmm1,0x10 + vpclmulqdq zmm0,zmm21,zmm1,0x11 + vpclmulqdq zmm3,zmm21,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_373 +$L$_small_initial_partial_block_373: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[128+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[192+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm21,ymm1,0x01 + vpclmulqdq ymm5,ymm21,ymm1,0x10 + vpclmulqdq ymm0,ymm21,ymm1,0x11 + vpclmulqdq ymm3,ymm21,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_373: + + or r13,r13 + je NEAR $L$_after_reduction_373 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_373: + jmp NEAR $L$_last_blocks_done_343 +$L$_last_num_blocks_is_16_343: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,192 + kmovq k1,[rax*8+r10] + cmp r15d,240 + jae NEAR $L$_16_blocks_overflow_374 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + vpaddd zmm5,zmm4,zmm27 + jmp NEAR $L$_16_blocks_ok_374 + +$L$_16_blocks_overflow_374: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpaddd zmm5,zmm4,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb zmm5,zmm5,zmm29 +$L$_16_blocks_ok_374: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + 
vmovdqa64 zmm8,ZMMWORD[1024+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm5,3 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1088+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[1152+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1216+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20,ZMMWORD[128+r11*1+r9] + vmovdqu8 zmm21{k1}{z},[192+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm24,zmm14,zmm12,0x96 + vpternlogq zmm25,zmm7,zmm13,0x96 + vpternlogq zmm26,zmm10,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vaesenclast zmm5,zmm5,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq 
zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vpxorq zmm5,zmm5,zmm21 + vextracti32x4 xmm11,zmm5,3 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm4 + vmovdqu8 ZMMWORD[192+r11*1+r10]{k1},zmm5 + vmovdqu8 zmm5{k1}{z},zmm5 + vpshufb zmm17,zmm0,zmm29 + vpshufb zmm19,zmm3,zmm29 + vpshufb zmm20,zmm4,zmm29 + vpshufb zmm21,zmm5,zmm29 + vextracti32x4 xmm7,zmm21,3 + sub r13,16 * (16 - 1) +$L$_small_initial_partial_block_375: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[112+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[176+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm21,zmm1,0x01 + vpclmulqdq zmm5,zmm21,zmm1,0x10 + vpclmulqdq zmm0,zmm21,zmm1,0x11 + vpclmulqdq zmm3,zmm21,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_375: + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_375: + jmp NEAR $L$_last_blocks_done_343 +$L$_last_num_blocks_is_0_343: + vmovdqa64 zmm13,ZMMWORD[1024+rsp] + vmovdqu64 zmm12,ZMMWORD[rbx*1+rsp] + vpclmulqdq zmm0,zmm13,zmm12,0x11 + vpclmulqdq zmm3,zmm13,zmm12,0x00 + vpclmulqdq zmm4,zmm13,zmm12,0x01 + vpclmulqdq zmm5,zmm13,zmm12,0x10 + vmovdqa64 zmm13,ZMMWORD[1088+rsp] + vmovdqu64 zmm12,ZMMWORD[64+rbx*1+rsp] + vpclmulqdq zmm6,zmm13,zmm12,0x11 + vpclmulqdq zmm7,zmm13,zmm12,0x00 + vpclmulqdq zmm10,zmm13,zmm12,0x01 + vpclmulqdq zmm11,zmm13,zmm12,0x10 + vpternlogq zmm26,zmm4,zmm10,0x96 + vpternlogq zmm24,zmm0,zmm6,0x96 + vpternlogq zmm25,zmm3,zmm7,0x96 + vpternlogq zmm26,zmm5,zmm11,0x96 + vmovdqa64 zmm13,ZMMWORD[1152+rsp] + vmovdqu64 zmm12,ZMMWORD[128+rbx*1+rsp] + vpclmulqdq zmm0,zmm13,zmm12,0x11 + vpclmulqdq zmm3,zmm13,zmm12,0x00 + vpclmulqdq zmm4,zmm13,zmm12,0x01 + vpclmulqdq zmm5,zmm13,zmm12,0x10 + vmovdqa64 zmm13,ZMMWORD[1216+rsp] + vmovdqu64 zmm12,ZMMWORD[192+rbx*1+rsp] + vpclmulqdq zmm6,zmm13,zmm12,0x11 + vpclmulqdq zmm7,zmm13,zmm12,0x00 + vpclmulqdq zmm10,zmm13,zmm12,0x01 + vpclmulqdq zmm11,zmm13,zmm12,0x10 + + vpternlogq zmm26,zmm4,zmm10,0x96 + vpternlogq zmm24,zmm0,zmm6,0x96 + vpternlogq zmm25,zmm3,zmm7,0x96 + vpternlogq zmm26,zmm5,zmm11,0x96 + + vpsrldq zmm0,zmm26,8 + vpslldq zmm3,zmm26,8 + vpxorq zmm24,zmm24,zmm0 
+ vpxorq zmm25,zmm25,zmm3 + vextracti64x4 ymm0,zmm24,1 + vpxorq ymm24,ymm24,ymm0 + vextracti32x4 xmm0,ymm24,1 + vpxorq xmm24,xmm24,xmm0 + vextracti64x4 ymm3,zmm25,1 + vpxorq ymm25,ymm25,ymm3 + vextracti32x4 xmm3,ymm25,1 + vpxorq xmm25,xmm25,xmm3 + vmovdqa64 xmm4,XMMWORD[POLY2] + + + vpclmulqdq xmm0,xmm4,xmm25,0x01 + vpslldq xmm0,xmm0,8 + vpxorq xmm0,xmm25,xmm0 + + + vpclmulqdq xmm3,xmm4,xmm0,0x00 + vpsrldq xmm3,xmm3,4 + vpclmulqdq xmm14,xmm4,xmm0,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm3,xmm24,0x96 + +$L$_last_blocks_done_343: + vpshufb xmm2,xmm2,xmm29 + jmp NEAR $L$_ghash_done_334 +$L$_encrypt_32_blocks_334: + cmp r15b,240 + jae NEAR $L$_16_blocks_overflow_376 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + vpaddd zmm5,zmm4,zmm27 + jmp NEAR $L$_16_blocks_ok_376 +$L$_16_blocks_overflow_376: + vpshufb zmm2,zmm2,zmm29 + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpaddd zmm5,zmm4,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb zmm5,zmm5,zmm29 +$L$_16_blocks_ok_376: + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rsp] + + + + + vshufi64x2 zmm2,zmm5,zmm5,255 + add r15b,16 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + + + + + + + + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + + vpclmulqdq zmm6,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + + + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + + + + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + + vpternlogq zmm6,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + + + + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + + + + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20,ZMMWORD[128+r11*1+r9] + vmovdqu8 zmm21,ZMMWORD[192+r11*1+r9] + + + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + + + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq 
zmm26,zmm10,zmm15 + vpxorq zmm24,zmm6,zmm12 + vpxorq zmm25,zmm7,zmm13 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vaesenclast zmm5,zmm5,zmm30 + + + + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vpxorq zmm5,zmm5,zmm21 + + + + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm4 + vmovdqu8 ZMMWORD[192+r11*1+r10],zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb zmm5,zmm5,zmm29 + vmovdqa64 ZMMWORD[1280+rsp],zmm0 + vmovdqa64 ZMMWORD[1344+rsp],zmm3 + vmovdqa64 ZMMWORD[1408+rsp],zmm4 + vmovdqa64 ZMMWORD[1472+rsp],zmm5 + cmp r15b,240 + jae NEAR $L$_16_blocks_overflow_377 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + vpaddd zmm5,zmm4,zmm27 + jmp NEAR $L$_16_blocks_ok_377 +$L$_16_blocks_overflow_377: + vpshufb zmm2,zmm2,zmm29 + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpaddd zmm5,zmm4,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb zmm5,zmm5,zmm29 +$L$_16_blocks_ok_377: + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1024+rsp] + vmovdqu64 zmm1,ZMMWORD[256+rsp] + + + + + vshufi64x2 zmm2,zmm5,zmm5,255 + add r15b,16 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[320+rsp] + vmovdqa64 zmm22,ZMMWORD[1088+rsp] + + + + + + + + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + + vpclmulqdq zmm6,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[384+rsp] + vmovdqa64 zmm8,ZMMWORD[1152+rsp] + + + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[448+rsp] + vmovdqa64 zmm22,ZMMWORD[1216+rsp] + + + + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + + + vaesenc zmm0,zmm0,zmm31 + 
vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + + vpternlogq zmm6,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + + + + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + + + + vmovdqu8 zmm17,ZMMWORD[256+r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[320+r11*1+r9] + vmovdqu8 zmm20,ZMMWORD[384+r11*1+r9] + vmovdqu8 zmm21,ZMMWORD[448+r11*1+r9] + + + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + + + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm26,zmm10,zmm15,0x96 + vpternlogq zmm24,zmm6,zmm12,0x96 + vpternlogq zmm25,zmm7,zmm13,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vaesenclast zmm5,zmm5,zmm30 + + + + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vpxorq zmm5,zmm5,zmm21 + + + + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[256+r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[320+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[384+r11*1+r10],zmm4 + vmovdqu8 ZMMWORD[448+r11*1+r10],zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb zmm5,zmm5,zmm29 + vmovdqa64 ZMMWORD[768+rsp],zmm0 + vmovdqa64 ZMMWORD[832+rsp],zmm3 + vmovdqa64 ZMMWORD[896+rsp],zmm4 + vmovdqa64 ZMMWORD[960+rsp],zmm5 + vmovdqa64 zmm13,ZMMWORD[1280+rsp] + vmovdqu64 zmm12,ZMMWORD[512+rsp] + vpclmulqdq zmm0,zmm13,zmm12,0x11 + vpclmulqdq zmm3,zmm13,zmm12,0x00 + vpclmulqdq zmm4,zmm13,zmm12,0x01 + vpclmulqdq zmm5,zmm13,zmm12,0x10 + vmovdqa64 zmm13,ZMMWORD[1344+rsp] + vmovdqu64 zmm12,ZMMWORD[576+rsp] + vpclmulqdq zmm6,zmm13,zmm12,0x11 + vpclmulqdq zmm7,zmm13,zmm12,0x00 + vpclmulqdq zmm10,zmm13,zmm12,0x01 + vpclmulqdq zmm11,zmm13,zmm12,0x10 + vpternlogq zmm26,zmm4,zmm10,0x96 + vpternlogq zmm24,zmm0,zmm6,0x96 + vpternlogq zmm25,zmm3,zmm7,0x96 + vpternlogq zmm26,zmm5,zmm11,0x96 + vmovdqa64 zmm13,ZMMWORD[1408+rsp] + vmovdqu64 zmm12,ZMMWORD[640+rsp] + vpclmulqdq zmm0,zmm13,zmm12,0x11 + vpclmulqdq zmm3,zmm13,zmm12,0x00 + vpclmulqdq zmm4,zmm13,zmm12,0x01 + vpclmulqdq 
zmm5,zmm13,zmm12,0x10 + vmovdqa64 zmm13,ZMMWORD[1472+rsp] + vmovdqu64 zmm12,ZMMWORD[704+rsp] + vpclmulqdq zmm6,zmm13,zmm12,0x11 + vpclmulqdq zmm7,zmm13,zmm12,0x00 + vpclmulqdq zmm10,zmm13,zmm12,0x01 + vpclmulqdq zmm11,zmm13,zmm12,0x10 + + vpternlogq zmm26,zmm4,zmm10,0x96 + vpternlogq zmm24,zmm0,zmm6,0x96 + vpternlogq zmm25,zmm3,zmm7,0x96 + vpternlogq zmm26,zmm5,zmm11,0x96 + + vpsrldq zmm0,zmm26,8 + vpslldq zmm3,zmm26,8 + vpxorq zmm24,zmm24,zmm0 + vpxorq zmm25,zmm25,zmm3 + vextracti64x4 ymm0,zmm24,1 + vpxorq ymm24,ymm24,ymm0 + vextracti32x4 xmm0,ymm24,1 + vpxorq xmm24,xmm24,xmm0 + vextracti64x4 ymm3,zmm25,1 + vpxorq ymm25,ymm25,ymm3 + vextracti32x4 xmm3,ymm25,1 + vpxorq xmm25,xmm25,xmm3 + vmovdqa64 xmm4,XMMWORD[POLY2] + + + vpclmulqdq xmm0,xmm4,xmm25,0x01 + vpslldq xmm0,xmm0,8 + vpxorq xmm0,xmm25,xmm0 + + + vpclmulqdq xmm3,xmm4,xmm0,0x00 + vpsrldq xmm3,xmm3,4 + vpclmulqdq xmm14,xmm4,xmm0,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm3,xmm24,0x96 + + sub r13,512 + add r11,512 + mov r10d,r13d + and r10d,~15 + mov ebx,512 + sub ebx,r10d + mov r10d,r13d + add r10d,15 + shr r10d,4 + je NEAR $L$_last_num_blocks_is_0_378 + + cmp r10d,8 + je NEAR $L$_last_num_blocks_is_8_378 + jb NEAR $L$_last_num_blocks_is_7_1_378 + + + cmp r10d,12 + je NEAR $L$_last_num_blocks_is_12_378 + jb NEAR $L$_last_num_blocks_is_11_9_378 + + + cmp r10d,15 + je NEAR $L$_last_num_blocks_is_15_378 + ja NEAR $L$_last_num_blocks_is_16_378 + cmp r10d,14 + je NEAR $L$_last_num_blocks_is_14_378 + jmp NEAR $L$_last_num_blocks_is_13_378 + +$L$_last_num_blocks_is_11_9_378: + + cmp r10d,10 + je NEAR $L$_last_num_blocks_is_10_378 + ja NEAR $L$_last_num_blocks_is_11_378 + jmp NEAR $L$_last_num_blocks_is_9_378 + +$L$_last_num_blocks_is_7_1_378: + cmp r10d,4 + je NEAR $L$_last_num_blocks_is_4_378 + jb NEAR $L$_last_num_blocks_is_3_1_378 + + cmp r10d,6 + ja NEAR $L$_last_num_blocks_is_7_378 + je NEAR $L$_last_num_blocks_is_6_378 + jmp NEAR $L$_last_num_blocks_is_5_378 + +$L$_last_num_blocks_is_3_1_378: + + cmp r10d,2 + ja NEAR $L$_last_num_blocks_is_3_378 + je NEAR $L$_last_num_blocks_is_2_378 +$L$_last_num_blocks_is_1_378: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + kmovq k1,[rax*8+r10] + cmp r15d,255 + jae NEAR $L$_16_blocks_overflow_379 + vpaddd xmm0,xmm2,xmm28 + jmp NEAR $L$_16_blocks_ok_379 + +$L$_16_blocks_overflow_379: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpshufb xmm0,xmm0,xmm29 +$L$_16_blocks_ok_379: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm0,0 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc xmm0,xmm0,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc xmm0,xmm0,xmm31 + 
vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 xmm17{k1}{z},[r11*1+r9] + vaesenc xmm0,xmm0,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc xmm0,xmm0,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc xmm0,xmm0,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc xmm0,xmm0,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + vaesenc xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + vaesenc xmm0,xmm0,xmm31 + vaesenclast xmm0,xmm0,xmm30 + vpxorq xmm0,xmm0,xmm17 + vextracti32x4 xmm11,zmm0,0 + mov r10,QWORD[120+rbp] + vmovdqu8 XMMWORD[r11*1+r10]{k1},xmm0 + vmovdqu8 zmm0{k1}{z},zmm0 + vpshufb xmm17,xmm0,xmm29 + vextracti32x4 xmm7,zmm17,0 + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_380 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm17,xmm1,0x01 + vpclmulqdq xmm5,xmm17,xmm1,0x10 + vpclmulqdq xmm0,xmm17,xmm1,0x11 + vpclmulqdq xmm3,xmm17,xmm1,0x00 + vpxorq zmm4,zmm4,zmm26 + vpxorq zmm0,zmm0,zmm24 + vpxorq zmm3,zmm3,zmm25 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_380 +$L$_small_initial_partial_block_380: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + + + vpsrldq zmm0,zmm26,8 + vpslldq zmm3,zmm26,8 + vpxorq zmm24,zmm24,zmm0 + vpxorq zmm25,zmm25,zmm3 + vextracti64x4 ymm0,zmm24,1 + vpxorq ymm24,ymm24,ymm0 + vextracti32x4 xmm0,ymm24,1 + vpxorq xmm24,xmm24,xmm0 + vextracti64x4 ymm3,zmm25,1 + vpxorq ymm25,ymm25,ymm3 + vextracti32x4 xmm3,ymm25,1 + vpxorq xmm25,xmm25,xmm3 + vmovdqa64 xmm0,XMMWORD[POLY2] + + + vpclmulqdq xmm3,xmm0,xmm25,0x01 + vpslldq xmm3,xmm3,8 + vpxorq xmm3,xmm25,xmm3 + + + vpclmulqdq xmm4,xmm0,xmm3,0x00 + vpsrldq xmm4,xmm4,4 + vpclmulqdq xmm14,xmm0,xmm3,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm4,xmm24,0x96 + + + + + + + + + + + + + vpxorq xmm14,xmm14,xmm7 + + jmp NEAR $L$_after_reduction_380 +$L$_small_initial_compute_done_380: +$L$_after_reduction_380: + jmp NEAR $L$_last_blocks_done_378 +$L$_last_num_blocks_is_2_378: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + kmovq k1,[rax*8+r10] + cmp r15d,254 + jae NEAR $L$_16_blocks_overflow_381 + vpaddd ymm0,ymm2,ymm28 + jmp NEAR $L$_16_blocks_ok_381 + +$L$_16_blocks_overflow_381: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpshufb 
ymm0,ymm0,ymm29 +$L$_16_blocks_ok_381: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm0,1 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc ymm0,ymm0,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc ymm0,ymm0,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 ymm17{k1}{z},[r11*1+r9] + vaesenc ymm0,ymm0,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc ymm0,ymm0,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc ymm0,ymm0,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc ymm0,ymm0,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + vaesenc ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + vaesenc ymm0,ymm0,ymm31 + vaesenclast ymm0,ymm0,ymm30 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm11,zmm0,1 + mov r10,QWORD[120+rbp] + vmovdqu8 YMMWORD[r11*1+r10]{k1},ymm0 + vmovdqu8 zmm0{k1}{z},zmm0 + vpshufb ymm17,ymm0,ymm29 + vextracti32x4 xmm7,zmm17,1 + sub r13,16 * (2 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_382 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm17,ymm1,0x01 + vpclmulqdq ymm5,ymm17,ymm1,0x10 + vpclmulqdq ymm0,ymm17,ymm1,0x11 + vpclmulqdq ymm3,ymm17,ymm1,0x00 + vpxorq zmm4,zmm4,zmm26 + vpxorq zmm0,zmm0,zmm24 + vpxorq zmm3,zmm3,zmm25 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_382 +$L$_small_initial_partial_block_382: + + + + + + 
+ + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm17,xmm1,0x01 + vpclmulqdq xmm5,xmm17,xmm1,0x10 + vpclmulqdq xmm0,xmm17,xmm1,0x11 + vpclmulqdq xmm3,xmm17,xmm1,0x00 + vpxorq zmm4,zmm4,zmm26 + vpxorq zmm0,zmm0,zmm24 + vpxorq zmm3,zmm3,zmm25 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_382: + + or r13,r13 + je NEAR $L$_after_reduction_382 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_382: + jmp NEAR $L$_last_blocks_done_378 +$L$_last_num_blocks_is_3_378: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + kmovq k1,[rax*8+r10] + cmp r15d,253 + jae NEAR $L$_16_blocks_overflow_383 + vpaddd zmm0,zmm2,zmm28 + jmp NEAR $L$_16_blocks_ok_383 + +$L$_16_blocks_overflow_383: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpshufb zmm0,zmm0,zmm29 +$L$_16_blocks_ok_383: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm0,2 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17{k1}{z},[r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 
zmm31,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vpxorq zmm0,zmm0,zmm17 + vextracti32x4 xmm11,zmm0,2 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10]{k1},zmm0 + vmovdqu8 zmm0{k1}{z},zmm0 + vpshufb zmm17,zmm0,zmm29 + vextracti32x4 xmm7,zmm17,2 + sub r13,16 * (3 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_384 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpxorq zmm4,zmm4,zmm26 + vpxorq zmm0,zmm0,zmm24 + vpxorq zmm3,zmm3,zmm25 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_384 +$L$_small_initial_partial_block_384: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm17,ymm1,0x01 + vpclmulqdq ymm5,ymm17,ymm1,0x10 + vpclmulqdq ymm0,ymm17,ymm1,0x11 + vpclmulqdq ymm3,ymm17,ymm1,0x00 + vpxorq zmm4,zmm4,zmm26 + vpxorq zmm0,zmm0,zmm24 + vpxorq zmm3,zmm3,zmm25 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_384: + + or r13,r13 + je NEAR $L$_after_reduction_384 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_384: + jmp NEAR $L$_last_blocks_done_378 +$L$_last_num_blocks_is_4_378: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + kmovq k1,[rax*8+r10] + cmp r15d,252 + jae NEAR $L$_16_blocks_overflow_385 + vpaddd zmm0,zmm2,zmm28 + jmp NEAR $L$_16_blocks_ok_385 + +$L$_16_blocks_overflow_385: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpshufb zmm0,zmm0,zmm29 +$L$_16_blocks_ok_385: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm0,3 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + 
+ vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17{k1}{z},[r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vpxorq zmm0,zmm0,zmm17 + vextracti32x4 xmm11,zmm0,3 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10]{k1},zmm0 + vmovdqu8 zmm0{k1}{z},zmm0 + vpshufb zmm17,zmm0,zmm29 + vextracti32x4 xmm7,zmm17,3 + sub r13,16 * (4 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_386 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + + vpxorq zmm30,zmm30,zmm26 + vpxorq zmm8,zmm8,zmm24 + vpxorq zmm22,zmm22,zmm25 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_386 +$L$_small_initial_partial_block_386: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpxorq zmm4,zmm4,zmm26 + vpxorq zmm0,zmm0,zmm24 + vpxorq zmm3,zmm3,zmm25 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq 
xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_386: + + or r13,r13 + je NEAR $L$_after_reduction_386 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_386: + jmp NEAR $L$_last_blocks_done_378 +$L$_last_num_blocks_is_5_378: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,64 + kmovq k1,[rax*8+r10] + cmp r15d,251 + jae NEAR $L$_16_blocks_overflow_387 + vpaddd zmm0,zmm2,zmm28 + vpaddd xmm3,xmm0,xmm27 + jmp NEAR $L$_16_blocks_ok_387 + +$L$_16_blocks_overflow_387: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb xmm3,xmm3,xmm29 +$L$_16_blocks_ok_387: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm3,0 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 xmm19{k1}{z},[64+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc 
xmm3,xmm3,xmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast xmm3,xmm3,xmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq xmm3,xmm3,xmm19 + vextracti32x4 xmm11,zmm3,0 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 XMMWORD[64+r11*1+r10]{k1},xmm3 + vmovdqu8 zmm3{k1}{z},zmm3 + vpshufb zmm17,zmm0,zmm29 + vpshufb xmm19,xmm3,xmm29 + vextracti32x4 xmm7,zmm19,0 + sub r13,16 * (5 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_388 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm19,xmm1,0x01 + vpclmulqdq xmm5,xmm19,xmm1,0x10 + vpclmulqdq xmm0,xmm19,xmm1,0x11 + vpclmulqdq xmm3,xmm19,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_388 +$L$_small_initial_partial_block_388: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + + vpxorq zmm30,zmm30,zmm26 + vpxorq zmm8,zmm8,zmm24 + vpxorq zmm22,zmm22,zmm25 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_388: + + or r13,r13 + je NEAR $L$_after_reduction_388 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_388: + jmp NEAR $L$_last_blocks_done_378 +$L$_last_num_blocks_is_6_378: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,64 + kmovq k1,[rax*8+r10] + cmp r15d,250 + jae NEAR $L$_16_blocks_overflow_389 + vpaddd zmm0,zmm2,zmm28 + vpaddd ymm3,ymm0,ymm27 + jmp NEAR $L$_16_blocks_ok_389 + +$L$_16_blocks_overflow_389: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb ymm3,ymm3,ymm29 +$L$_16_blocks_ok_389: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm3,1 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + 
vpxorq zmm0,zmm0,zmm30 + vpxorq ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 ymm19{k1}{z},[64+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast ymm3,ymm3,ymm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm11,zmm3,1 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 YMMWORD[64+r11*1+r10]{k1},ymm3 + vmovdqu8 zmm3{k1}{z},zmm3 + vpshufb zmm17,zmm0,zmm29 + vpshufb ymm19,ymm3,ymm29 + vextracti32x4 xmm7,zmm19,1 + sub r13,16 * (6 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_390 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm19,ymm1,0x01 + vpclmulqdq ymm5,ymm19,ymm1,0x10 + vpclmulqdq ymm0,ymm19,ymm1,0x11 + vpclmulqdq ymm3,ymm19,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + 
vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_390 +$L$_small_initial_partial_block_390: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm19,xmm1,0x01 + vpclmulqdq xmm5,xmm19,xmm1,0x10 + vpclmulqdq xmm0,xmm19,xmm1,0x11 + vpclmulqdq xmm3,xmm19,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_390: + + or r13,r13 + je NEAR $L$_after_reduction_390 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_390: + jmp NEAR $L$_last_blocks_done_378 +$L$_last_num_blocks_is_7_378: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,64 + kmovq k1,[rax*8+r10] + cmp r15d,249 + jae NEAR $L$_16_blocks_overflow_391 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + jmp NEAR $L$_16_blocks_ok_391 + +$L$_16_blocks_overflow_391: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 +$L$_16_blocks_ok_391: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm3,2 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq 
zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19{k1}{z},[64+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti32x4 xmm11,zmm3,2 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10]{k1},zmm3 + vmovdqu8 zmm3{k1}{z},zmm3 + vpshufb zmm17,zmm0,zmm29 + vpshufb zmm19,zmm3,zmm29 + vextracti32x4 xmm7,zmm19,2 + sub r13,16 * (7 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_392 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm19,zmm1,0x01 + vpclmulqdq zmm5,zmm19,zmm1,0x10 + vpclmulqdq zmm0,zmm19,zmm1,0x11 + vpclmulqdq zmm3,zmm19,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_392 +$L$_small_initial_partial_block_392: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm19,ymm1,0x01 + vpclmulqdq ymm5,ymm19,ymm1,0x10 + vpclmulqdq ymm0,ymm19,ymm1,0x11 + vpclmulqdq ymm3,ymm19,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq 
zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_392: + + or r13,r13 + je NEAR $L$_after_reduction_392 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_392: + jmp NEAR $L$_last_blocks_done_378 +$L$_last_num_blocks_is_8_378: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,64 + kmovq k1,[rax*8+r10] + cmp r15d,248 + jae NEAR $L$_16_blocks_overflow_393 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + jmp NEAR $L$_16_blocks_ok_393 + +$L$_16_blocks_overflow_393: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 +$L$_16_blocks_ok_393: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm3,3 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19{k1}{z},[64+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc 
zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti32x4 xmm11,zmm3,3 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10]{k1},zmm3 + vmovdqu8 zmm3{k1}{z},zmm3 + vpshufb zmm17,zmm0,zmm29 + vpshufb zmm19,zmm3,zmm29 + vextracti32x4 xmm7,zmm19,3 + sub r13,16 * (8 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_394 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[224+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + + vpxorq zmm30,zmm30,zmm26 + vpxorq zmm8,zmm8,zmm24 + vpxorq zmm22,zmm22,zmm25 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_394 +$L$_small_initial_partial_block_394: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm19,zmm1,0x01 + vpclmulqdq zmm5,zmm19,zmm1,0x10 + vpclmulqdq zmm0,zmm19,zmm1,0x11 + vpclmulqdq zmm3,zmm19,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_394: + + or r13,r13 + je NEAR $L$_after_reduction_394 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_394: + jmp NEAR $L$_last_blocks_done_378 +$L$_last_num_blocks_is_9_378: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,128 + kmovq k1,[rax*8+r10] + cmp r15d,247 + jae NEAR $L$_16_blocks_overflow_395 + vpaddd 
zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd xmm4,xmm3,xmm27 + jmp NEAR $L$_16_blocks_ok_395 + +$L$_16_blocks_overflow_395: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb xmm4,xmm4,xmm29 +$L$_16_blocks_ok_395: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm4,0 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 xmm20{k1}{z},[128+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast xmm4,xmm4,xmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq 
xmm4,xmm4,xmm20 + vextracti32x4 xmm11,zmm4,0 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 XMMWORD[128+r11*1+r10]{k1},xmm4 + vmovdqu8 zmm4{k1}{z},zmm4 + vpshufb zmm17,zmm0,zmm29 + vpshufb zmm19,zmm3,zmm29 + vpshufb xmm20,xmm4,xmm29 + vextracti32x4 xmm7,zmm20,0 + sub r13,16 * (9 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_396 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[208+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm20,xmm1,0x01 + vpclmulqdq xmm5,xmm20,xmm1,0x10 + vpclmulqdq xmm0,xmm20,xmm1,0x11 + vpclmulqdq xmm3,xmm20,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_396 +$L$_small_initial_partial_block_396: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[224+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + + vpxorq zmm30,zmm30,zmm26 + vpxorq zmm8,zmm8,zmm24 + vpxorq zmm22,zmm22,zmm25 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_396: + + or r13,r13 + je NEAR $L$_after_reduction_396 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_396: + jmp NEAR $L$_last_blocks_done_378 +$L$_last_num_blocks_is_10_378: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,128 + kmovq k1,[rax*8+r10] + cmp r15d,246 + jae NEAR $L$_16_blocks_overflow_397 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd ymm4,ymm3,ymm27 + jmp NEAR $L$_16_blocks_ok_397 + 
+$L$_16_blocks_overflow_397: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb ymm4,ymm4,ymm29 +$L$_16_blocks_ok_397: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm4,1 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 ymm20{k1}{z},[128+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast ymm4,ymm4,ymm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq ymm4,ymm4,ymm20 + vextracti32x4 xmm11,zmm4,1 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + 
vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 YMMWORD[128+r11*1+r10]{k1},ymm4 + vmovdqu8 zmm4{k1}{z},zmm4 + vpshufb zmm17,zmm0,zmm29 + vpshufb zmm19,zmm3,zmm29 + vpshufb ymm20,ymm4,ymm29 + vextracti32x4 xmm7,zmm20,1 + sub r13,16 * (10 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_398 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[192+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm20,ymm1,0x01 + vpclmulqdq ymm5,ymm20,ymm1,0x10 + vpclmulqdq ymm0,ymm20,ymm1,0x11 + vpclmulqdq ymm3,ymm20,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_398 +$L$_small_initial_partial_block_398: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[208+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm20,xmm1,0x01 + vpclmulqdq xmm5,xmm20,xmm1,0x10 + vpclmulqdq xmm0,xmm20,xmm1,0x11 + vpclmulqdq xmm3,xmm20,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_398: + + or r13,r13 + je NEAR $L$_after_reduction_398 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_398: + jmp NEAR $L$_last_blocks_done_378 +$L$_last_num_blocks_is_11_378: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,128 + kmovq k1,[rax*8+r10] + cmp r15d,245 + jae NEAR $L$_16_blocks_overflow_399 + 
vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + jmp NEAR $L$_16_blocks_ok_399 + +$L$_16_blocks_overflow_399: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 +$L$_16_blocks_ok_399: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm4,2 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20{k1}{z},[128+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq 
zmm4,zmm4,zmm20 + vextracti32x4 xmm11,zmm4,2 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10]{k1},zmm4 + vmovdqu8 zmm4{k1}{z},zmm4 + vpshufb zmm17,zmm0,zmm29 + vpshufb zmm19,zmm3,zmm29 + vpshufb zmm20,zmm4,zmm29 + vextracti32x4 xmm7,zmm20,2 + sub r13,16 * (11 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_400 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[176+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm20,zmm1,0x01 + vpclmulqdq zmm5,zmm20,zmm1,0x10 + vpclmulqdq zmm0,zmm20,zmm1,0x11 + vpclmulqdq zmm3,zmm20,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_400 +$L$_small_initial_partial_block_400: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[192+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm20,ymm1,0x01 + vpclmulqdq ymm5,ymm20,ymm1,0x10 + vpclmulqdq ymm0,ymm20,ymm1,0x11 + vpclmulqdq ymm3,ymm20,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_400: + + or r13,r13 + je NEAR $L$_after_reduction_400 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_400: + jmp NEAR $L$_last_blocks_done_378 
+$L$_last_num_blocks_is_12_378: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,128 + kmovq k1,[rax*8+r10] + cmp r15d,244 + jae NEAR $L$_16_blocks_overflow_401 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + jmp NEAR $L$_16_blocks_ok_401 + +$L$_16_blocks_overflow_401: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 +$L$_16_blocks_ok_401: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm4,3 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20{k1}{z},[128+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + 
vaesenc zmm4,zmm4,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vextracti32x4 xmm11,zmm4,3 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10]{k1},zmm4 + vmovdqu8 zmm4{k1}{z},zmm4 + vpshufb zmm17,zmm0,zmm29 + vpshufb zmm19,zmm3,zmm29 + vpshufb zmm20,zmm4,zmm29 + vextracti32x4 xmm7,zmm20,3 + sub r13,16 * (12 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_402 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[160+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[224+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + + vpxorq zmm30,zmm30,zmm26 + vpxorq zmm8,zmm8,zmm24 + vpxorq zmm22,zmm22,zmm25 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_402 +$L$_small_initial_partial_block_402: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[176+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm20,zmm1,0x01 + vpclmulqdq zmm5,zmm20,zmm1,0x10 + vpclmulqdq zmm0,zmm20,zmm1,0x11 + vpclmulqdq zmm3,zmm20,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + 
+$L$_small_initial_compute_done_402: + + or r13,r13 + je NEAR $L$_after_reduction_402 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_402: + jmp NEAR $L$_last_blocks_done_378 +$L$_last_num_blocks_is_13_378: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,192 + kmovq k1,[rax*8+r10] + cmp r15d,243 + jae NEAR $L$_16_blocks_overflow_403 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + vpaddd xmm5,xmm4,xmm27 + jmp NEAR $L$_16_blocks_ok_403 + +$L$_16_blocks_overflow_403: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpaddd zmm5,zmm4,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb xmm5,xmm5,xmm29 +$L$_16_blocks_ok_403: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm5,0 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vpxorq xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20,ZMMWORD[128+r11*1+r9] + vmovdqu8 xmm21{k1}{z},[192+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc 
zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vaesenclast xmm5,xmm5,xmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vpxorq xmm5,xmm5,xmm21 + vextracti32x4 xmm11,zmm5,0 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm4 + vmovdqu8 XMMWORD[192+r11*1+r10]{k1},xmm5 + vmovdqu8 zmm5{k1}{z},zmm5 + vpshufb zmm17,zmm0,zmm29 + vpshufb zmm19,zmm3,zmm29 + vpshufb zmm20,zmm4,zmm29 + vpshufb xmm21,xmm5,xmm29 + vextracti32x4 xmm7,zmm21,0 + sub r13,16 * (13 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_404 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[144+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[208+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm21,xmm1,0x01 + vpclmulqdq xmm5,xmm21,xmm1,0x10 + vpclmulqdq xmm0,xmm21,xmm1,0x11 + vpclmulqdq xmm3,xmm21,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_404 +$L$_small_initial_partial_block_404: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[160+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[224+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq 
zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + + vpxorq zmm30,zmm30,zmm26 + vpxorq zmm8,zmm8,zmm24 + vpxorq zmm22,zmm22,zmm25 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_404: + + or r13,r13 + je NEAR $L$_after_reduction_404 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_404: + jmp NEAR $L$_last_blocks_done_378 +$L$_last_num_blocks_is_14_378: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,192 + kmovq k1,[rax*8+r10] + cmp r15d,242 + jae NEAR $L$_16_blocks_overflow_405 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + vpaddd ymm5,ymm4,ymm27 + jmp NEAR $L$_16_blocks_ok_405 + +$L$_16_blocks_overflow_405: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpaddd zmm5,zmm4,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb ymm5,ymm5,ymm29 +$L$_16_blocks_ok_405: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm5,1 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vpxorq ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 
zmm20,ZMMWORD[128+r11*1+r9] + vmovdqu8 ymm21{k1}{z},[192+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vaesenclast ymm5,ymm5,ymm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vpxorq ymm5,ymm5,ymm21 + vextracti32x4 xmm11,zmm5,1 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm4 + vmovdqu8 YMMWORD[192+r11*1+r10]{k1},ymm5 + vmovdqu8 zmm5{k1}{z},zmm5 + vpshufb zmm17,zmm0,zmm29 + vpshufb zmm19,zmm3,zmm29 + vpshufb zmm20,zmm4,zmm29 + vpshufb ymm21,ymm5,ymm29 + vextracti32x4 xmm7,zmm21,1 + sub r13,16 * (14 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_406 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[128+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[192+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm21,ymm1,0x01 + vpclmulqdq ymm5,ymm21,ymm1,0x10 + vpclmulqdq ymm0,ymm21,ymm1,0x11 + vpclmulqdq ymm3,ymm21,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq 
ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_406 +$L$_small_initial_partial_block_406: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[144+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[208+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm21,xmm1,0x01 + vpclmulqdq xmm5,xmm21,xmm1,0x10 + vpclmulqdq xmm0,xmm21,xmm1,0x11 + vpclmulqdq xmm3,xmm21,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_406: + + or r13,r13 + je NEAR $L$_after_reduction_406 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_406: + jmp NEAR $L$_last_blocks_done_378 +$L$_last_num_blocks_is_15_378: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,192 + kmovq k1,[rax*8+r10] + cmp r15d,241 + jae NEAR $L$_16_blocks_overflow_407 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + vpaddd zmm5,zmm4,zmm27 + jmp NEAR $L$_16_blocks_ok_407 + +$L$_16_blocks_overflow_407: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpaddd zmm5,zmm4,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb zmm5,zmm5,zmm29 +$L$_16_blocks_ok_407: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm5,2 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc 
zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20,ZMMWORD[128+r11*1+r9] + vmovdqu8 zmm21{k1}{z},[192+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vaesenclast zmm5,zmm5,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vpxorq zmm5,zmm5,zmm21 + vextracti32x4 xmm11,zmm5,2 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm4 + vmovdqu8 ZMMWORD[192+r11*1+r10]{k1},zmm5 + vmovdqu8 zmm5{k1}{z},zmm5 + vpshufb zmm17,zmm0,zmm29 + vpshufb zmm19,zmm3,zmm29 + vpshufb zmm20,zmm4,zmm29 + vpshufb zmm21,zmm5,zmm29 + vextracti32x4 xmm7,zmm21,2 + sub r13,16 * (15 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_408 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[112+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 
+ vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[176+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm21,zmm1,0x01 + vpclmulqdq zmm5,zmm21,zmm1,0x10 + vpclmulqdq zmm0,zmm21,zmm1,0x11 + vpclmulqdq zmm3,zmm21,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_408 +$L$_small_initial_partial_block_408: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[128+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[192+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm21,ymm1,0x01 + vpclmulqdq ymm5,ymm21,ymm1,0x10 + vpclmulqdq ymm0,ymm21,ymm1,0x11 + vpclmulqdq ymm3,ymm21,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_408: + + or r13,r13 + je NEAR $L$_after_reduction_408 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_408: + jmp NEAR $L$_last_blocks_done_378 +$L$_last_num_blocks_is_16_378: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,192 + kmovq k1,[rax*8+r10] + cmp r15d,240 + jae NEAR 
$L$_16_blocks_overflow_409 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + vpaddd zmm5,zmm4,zmm27 + jmp NEAR $L$_16_blocks_ok_409 + +$L$_16_blocks_overflow_409: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpaddd zmm5,zmm4,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb zmm5,zmm5,zmm29 +$L$_16_blocks_ok_409: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm5,3 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20,ZMMWORD[128+r11*1+r9] + vmovdqu8 zmm21{k1}{z},[192+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc 
zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vaesenclast zmm5,zmm5,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vpxorq zmm5,zmm5,zmm21 + vextracti32x4 xmm11,zmm5,3 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm4 + vmovdqu8 ZMMWORD[192+r11*1+r10]{k1},zmm5 + vmovdqu8 zmm5{k1}{z},zmm5 + vpshufb zmm17,zmm0,zmm29 + vpshufb zmm19,zmm3,zmm29 + vpshufb zmm20,zmm4,zmm29 + vpshufb zmm21,zmm5,zmm29 + vextracti32x4 xmm7,zmm21,3 + sub r13,16 * (16 - 1) +$L$_small_initial_partial_block_410: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[112+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[176+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm21,zmm1,0x01 + vpclmulqdq zmm5,zmm21,zmm1,0x10 + vpclmulqdq zmm0,zmm21,zmm1,0x11 + vpclmulqdq zmm3,zmm21,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_410: + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_410: + jmp NEAR $L$_last_blocks_done_378 +$L$_last_num_blocks_is_0_378: + vmovdqa64 zmm13,ZMMWORD[768+rsp] + vpxorq zmm13,zmm13,zmm14 + vmovdqu64 zmm12,ZMMWORD[rbx*1+rsp] + vpclmulqdq zmm0,zmm13,zmm12,0x11 + vpclmulqdq zmm3,zmm13,zmm12,0x00 + vpclmulqdq zmm4,zmm13,zmm12,0x01 + vpclmulqdq zmm5,zmm13,zmm12,0x10 + vmovdqa64 zmm13,ZMMWORD[832+rsp] + vmovdqu64 zmm12,ZMMWORD[64+rbx*1+rsp] + vpclmulqdq zmm6,zmm13,zmm12,0x11 + vpclmulqdq zmm7,zmm13,zmm12,0x00 + vpclmulqdq zmm10,zmm13,zmm12,0x01 + vpclmulqdq zmm11,zmm13,zmm12,0x10 + vpxorq zmm26,zmm4,zmm10 + vpxorq zmm24,zmm0,zmm6 + vpxorq zmm25,zmm3,zmm7 + vpternlogq zmm26,zmm5,zmm11,0x96 + vmovdqa64 zmm13,ZMMWORD[896+rsp] + vmovdqu64 zmm12,ZMMWORD[128+rbx*1+rsp] + vpclmulqdq zmm0,zmm13,zmm12,0x11 + vpclmulqdq 
zmm3,zmm13,zmm12,0x00 + vpclmulqdq zmm4,zmm13,zmm12,0x01 + vpclmulqdq zmm5,zmm13,zmm12,0x10 + vmovdqa64 zmm13,ZMMWORD[960+rsp] + vmovdqu64 zmm12,ZMMWORD[192+rbx*1+rsp] + vpclmulqdq zmm6,zmm13,zmm12,0x11 + vpclmulqdq zmm7,zmm13,zmm12,0x00 + vpclmulqdq zmm10,zmm13,zmm12,0x01 + vpclmulqdq zmm11,zmm13,zmm12,0x10 + + vpternlogq zmm26,zmm4,zmm10,0x96 + vpternlogq zmm24,zmm0,zmm6,0x96 + vpternlogq zmm25,zmm3,zmm7,0x96 + vpternlogq zmm26,zmm5,zmm11,0x96 + + vpsrldq zmm0,zmm26,8 + vpslldq zmm3,zmm26,8 + vpxorq zmm24,zmm24,zmm0 + vpxorq zmm25,zmm25,zmm3 + vextracti64x4 ymm0,zmm24,1 + vpxorq ymm24,ymm24,ymm0 + vextracti32x4 xmm0,ymm24,1 + vpxorq xmm24,xmm24,xmm0 + vextracti64x4 ymm3,zmm25,1 + vpxorq ymm25,ymm25,ymm3 + vextracti32x4 xmm3,ymm25,1 + vpxorq xmm25,xmm25,xmm3 + vmovdqa64 xmm4,XMMWORD[POLY2] + + + vpclmulqdq xmm0,xmm4,xmm25,0x01 + vpslldq xmm0,xmm0,8 + vpxorq xmm0,xmm25,xmm0 + + + vpclmulqdq xmm3,xmm4,xmm0,0x00 + vpsrldq xmm3,xmm3,4 + vpclmulqdq xmm14,xmm4,xmm0,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm3,xmm24,0x96 + +$L$_last_blocks_done_378: + vpshufb xmm2,xmm2,xmm29 + jmp NEAR $L$_ghash_done_334 +$L$_encrypt_16_blocks_334: + cmp r15b,240 + jae NEAR $L$_16_blocks_overflow_411 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + vpaddd zmm5,zmm4,zmm27 + jmp NEAR $L$_16_blocks_ok_411 +$L$_16_blocks_overflow_411: + vpshufb zmm2,zmm2,zmm29 + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpaddd zmm5,zmm4,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb zmm5,zmm5,zmm29 +$L$_16_blocks_ok_411: + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rsp] + + + + + vshufi64x2 zmm2,zmm5,zmm5,255 + add r15b,16 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + + + + + + + + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + + vpclmulqdq zmm6,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + + + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + + + + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + + vpternlogq zmm6,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + + + + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + + + + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20,ZMMWORD[128+r11*1+r9] + vmovdqu8 
zmm21,ZMMWORD[192+r11*1+r9] + + + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + + + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm26,zmm10,zmm15 + vpxorq zmm24,zmm6,zmm12 + vpxorq zmm25,zmm7,zmm13 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vaesenclast zmm5,zmm5,zmm30 + + + + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vpxorq zmm5,zmm5,zmm21 + + + + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm4 + vmovdqu8 ZMMWORD[192+r11*1+r10],zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb zmm5,zmm5,zmm29 + vmovdqa64 ZMMWORD[1280+rsp],zmm0 + vmovdqa64 ZMMWORD[1344+rsp],zmm3 + vmovdqa64 ZMMWORD[1408+rsp],zmm4 + vmovdqa64 ZMMWORD[1472+rsp],zmm5 + vmovdqa64 zmm13,ZMMWORD[1024+rsp] + vmovdqu64 zmm12,ZMMWORD[256+rsp] + vpclmulqdq zmm0,zmm13,zmm12,0x11 + vpclmulqdq zmm3,zmm13,zmm12,0x00 + vpclmulqdq zmm4,zmm13,zmm12,0x01 + vpclmulqdq zmm5,zmm13,zmm12,0x10 + vmovdqa64 zmm13,ZMMWORD[1088+rsp] + vmovdqu64 zmm12,ZMMWORD[320+rsp] + vpclmulqdq zmm6,zmm13,zmm12,0x11 + vpclmulqdq zmm7,zmm13,zmm12,0x00 + vpclmulqdq zmm10,zmm13,zmm12,0x01 + vpclmulqdq zmm11,zmm13,zmm12,0x10 + vpternlogq zmm26,zmm4,zmm10,0x96 + vpternlogq zmm24,zmm0,zmm6,0x96 + vpternlogq zmm25,zmm3,zmm7,0x96 + vpternlogq zmm26,zmm5,zmm11,0x96 + vmovdqa64 zmm13,ZMMWORD[1152+rsp] + vmovdqu64 zmm12,ZMMWORD[384+rsp] + vpclmulqdq zmm0,zmm13,zmm12,0x11 + vpclmulqdq zmm3,zmm13,zmm12,0x00 + vpclmulqdq zmm4,zmm13,zmm12,0x01 + vpclmulqdq zmm5,zmm13,zmm12,0x10 + vmovdqa64 zmm13,ZMMWORD[1216+rsp] + vmovdqu64 zmm12,ZMMWORD[448+rsp] + vpclmulqdq zmm6,zmm13,zmm12,0x11 + vpclmulqdq zmm7,zmm13,zmm12,0x00 + vpclmulqdq zmm10,zmm13,zmm12,0x01 + vpclmulqdq zmm11,zmm13,zmm12,0x10 + + vpternlogq zmm26,zmm4,zmm10,0x96 + vpternlogq zmm24,zmm0,zmm6,0x96 + vpternlogq zmm25,zmm3,zmm7,0x96 + vpternlogq zmm26,zmm5,zmm11,0x96 + sub r13,256 + add r11,256 + mov r10d,r13d + add r10d,15 + shr r10d,4 + je NEAR $L$_last_num_blocks_is_0_412 + + cmp r10d,8 + je NEAR $L$_last_num_blocks_is_8_412 + jb NEAR 
$L$_last_num_blocks_is_7_1_412 + + + cmp r10d,12 + je NEAR $L$_last_num_blocks_is_12_412 + jb NEAR $L$_last_num_blocks_is_11_9_412 + + + cmp r10d,15 + je NEAR $L$_last_num_blocks_is_15_412 + ja NEAR $L$_last_num_blocks_is_16_412 + cmp r10d,14 + je NEAR $L$_last_num_blocks_is_14_412 + jmp NEAR $L$_last_num_blocks_is_13_412 + +$L$_last_num_blocks_is_11_9_412: + + cmp r10d,10 + je NEAR $L$_last_num_blocks_is_10_412 + ja NEAR $L$_last_num_blocks_is_11_412 + jmp NEAR $L$_last_num_blocks_is_9_412 + +$L$_last_num_blocks_is_7_1_412: + cmp r10d,4 + je NEAR $L$_last_num_blocks_is_4_412 + jb NEAR $L$_last_num_blocks_is_3_1_412 + + cmp r10d,6 + ja NEAR $L$_last_num_blocks_is_7_412 + je NEAR $L$_last_num_blocks_is_6_412 + jmp NEAR $L$_last_num_blocks_is_5_412 + +$L$_last_num_blocks_is_3_1_412: + + cmp r10d,2 + ja NEAR $L$_last_num_blocks_is_3_412 + je NEAR $L$_last_num_blocks_is_2_412 +$L$_last_num_blocks_is_1_412: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + kmovq k1,[rax*8+r10] + cmp r15d,255 + jae NEAR $L$_16_blocks_overflow_413 + vpaddd xmm0,xmm2,xmm28 + jmp NEAR $L$_16_blocks_ok_413 + +$L$_16_blocks_overflow_413: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpshufb xmm0,xmm0,xmm29 +$L$_16_blocks_ok_413: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1280+rsp] + vmovdqu64 zmm1,ZMMWORD[512+rsp] + vextracti32x4 xmm2,zmm0,0 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[576+rsp] + vmovdqa64 zmm22,ZMMWORD[1344+rsp] + vpxorq xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[640+rsp] + vmovdqa64 zmm8,ZMMWORD[1408+rsp] + vaesenc xmm0,xmm0,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[704+rsp] + vmovdqa64 zmm22,ZMMWORD[1472+rsp] + vaesenc xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc xmm0,xmm0,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 xmm17{k1}{z},[r11*1+r9] + vaesenc xmm0,xmm0,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm14,zmm24,zmm12,0x96 + vpternlogq zmm7,zmm25,zmm13,0x96 + vpternlogq zmm10,zmm26,zmm15,0x96 + vaesenc xmm0,xmm0,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vpsrldq zmm15,zmm10,8 + vpslldq zmm10,zmm10,8 + + vmovdqa64 xmm16,XMMWORD[POLY2] + vaesenc xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vpxorq zmm14,zmm14,zmm15 + vpxorq zmm7,zmm7,zmm10 + vaesenc xmm0,xmm0,xmm31 + vextracti64x4 ymm12,zmm14,1 + vpxorq ymm14,ymm14,ymm12 + vextracti32x4 xmm12,ymm14,1 + vpxorq xmm14,xmm14,xmm12 + vextracti64x4 ymm13,zmm7,1 + vpxorq ymm7,ymm7,ymm13 + vextracti32x4 xmm13,ymm7,1 + vpxorq xmm7,xmm7,xmm13 
+ vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vpclmulqdq xmm13,xmm16,xmm7,0x01 + vpslldq xmm13,xmm13,8 + vpxorq xmm13,xmm7,xmm13 + vaesenc xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc xmm0,xmm0,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + vaesenc xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + vaesenc xmm0,xmm0,xmm31 + vpclmulqdq xmm12,xmm16,xmm13,0x00 + vpsrldq xmm12,xmm12,4 + vpclmulqdq xmm15,xmm16,xmm13,0x10 + vpslldq xmm15,xmm15,4 + + vpternlogq xmm14,xmm15,xmm12,0x96 + vaesenclast xmm0,xmm0,xmm30 + vpxorq xmm0,xmm0,xmm17 + vextracti32x4 xmm11,zmm0,0 + mov r10,QWORD[120+rbp] + vmovdqu8 XMMWORD[r11*1+r10]{k1},xmm0 + vmovdqu8 zmm0{k1}{z},zmm0 + vpshufb xmm17,xmm0,xmm29 + vextracti32x4 xmm7,zmm17,0 + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_414 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm17,xmm1,0x01 + vpclmulqdq xmm5,xmm17,xmm1,0x10 + vpclmulqdq xmm0,xmm17,xmm1,0x11 + vpclmulqdq xmm3,xmm17,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_414 +$L$_small_initial_partial_block_414: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + + + + + + + + + + + + vpxorq xmm14,xmm14,xmm7 + + jmp NEAR $L$_after_reduction_414 +$L$_small_initial_compute_done_414: +$L$_after_reduction_414: + jmp NEAR $L$_last_blocks_done_412 +$L$_last_num_blocks_is_2_412: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + kmovq k1,[rax*8+r10] + cmp r15d,254 + jae NEAR $L$_16_blocks_overflow_415 + vpaddd ymm0,ymm2,ymm28 + jmp NEAR $L$_16_blocks_ok_415 + +$L$_16_blocks_overflow_415: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpshufb ymm0,ymm0,ymm29 +$L$_16_blocks_ok_415: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1280+rsp] + vmovdqu64 zmm1,ZMMWORD[512+rsp] + vextracti32x4 xmm2,zmm0,1 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[576+rsp] + vmovdqa64 zmm22,ZMMWORD[1344+rsp] + vpxorq ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[640+rsp] + vmovdqa64 zmm8,ZMMWORD[1408+rsp] + vaesenc ymm0,ymm0,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[704+rsp] + vmovdqa64 zmm22,ZMMWORD[1472+rsp] + vaesenc ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc ymm0,ymm0,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq 
zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 ymm17{k1}{z},[r11*1+r9] + vaesenc ymm0,ymm0,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm14,zmm24,zmm12,0x96 + vpternlogq zmm7,zmm25,zmm13,0x96 + vpternlogq zmm10,zmm26,zmm15,0x96 + vaesenc ymm0,ymm0,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vpsrldq zmm15,zmm10,8 + vpslldq zmm10,zmm10,8 + + vmovdqa64 xmm16,XMMWORD[POLY2] + vaesenc ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vpxorq zmm14,zmm14,zmm15 + vpxorq zmm7,zmm7,zmm10 + vaesenc ymm0,ymm0,ymm31 + vextracti64x4 ymm12,zmm14,1 + vpxorq ymm14,ymm14,ymm12 + vextracti32x4 xmm12,ymm14,1 + vpxorq xmm14,xmm14,xmm12 + vextracti64x4 ymm13,zmm7,1 + vpxorq ymm7,ymm7,ymm13 + vextracti32x4 xmm13,ymm7,1 + vpxorq xmm7,xmm7,xmm13 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vpclmulqdq xmm13,xmm16,xmm7,0x01 + vpslldq xmm13,xmm13,8 + vpxorq xmm13,xmm7,xmm13 + vaesenc ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc ymm0,ymm0,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + vaesenc ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + vaesenc ymm0,ymm0,ymm31 + vpclmulqdq xmm12,xmm16,xmm13,0x00 + vpsrldq xmm12,xmm12,4 + vpclmulqdq xmm15,xmm16,xmm13,0x10 + vpslldq xmm15,xmm15,4 + + vpternlogq xmm14,xmm15,xmm12,0x96 + vaesenclast ymm0,ymm0,ymm30 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm11,zmm0,1 + mov r10,QWORD[120+rbp] + vmovdqu8 YMMWORD[r11*1+r10]{k1},ymm0 + vmovdqu8 zmm0{k1}{z},zmm0 + vpshufb ymm17,ymm0,ymm29 + vextracti32x4 xmm7,zmm17,1 + sub r13,16 * (2 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_416 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm17,ymm1,0x01 + vpclmulqdq ymm5,ymm17,ymm1,0x10 + vpclmulqdq ymm0,ymm17,ymm1,0x11 + vpclmulqdq ymm3,ymm17,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_416 +$L$_small_initial_partial_block_416: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm17,xmm1,0x01 + vpclmulqdq xmm5,xmm17,xmm1,0x10 + vpclmulqdq xmm0,xmm17,xmm1,0x11 + vpclmulqdq xmm3,xmm17,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 
+ vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_416: + + or r13,r13 + je NEAR $L$_after_reduction_416 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_416: + jmp NEAR $L$_last_blocks_done_412 +$L$_last_num_blocks_is_3_412: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + kmovq k1,[rax*8+r10] + cmp r15d,253 + jae NEAR $L$_16_blocks_overflow_417 + vpaddd zmm0,zmm2,zmm28 + jmp NEAR $L$_16_blocks_ok_417 + +$L$_16_blocks_overflow_417: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpshufb zmm0,zmm0,zmm29 +$L$_16_blocks_ok_417: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1280+rsp] + vmovdqu64 zmm1,ZMMWORD[512+rsp] + vextracti32x4 xmm2,zmm0,2 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[576+rsp] + vmovdqa64 zmm22,ZMMWORD[1344+rsp] + vpxorq zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[640+rsp] + vmovdqa64 zmm8,ZMMWORD[1408+rsp] + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[704+rsp] + vmovdqa64 zmm22,ZMMWORD[1472+rsp] + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17{k1}{z},[r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm14,zmm24,zmm12,0x96 + vpternlogq zmm7,zmm25,zmm13,0x96 + vpternlogq zmm10,zmm26,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vpsrldq zmm15,zmm10,8 + vpslldq zmm10,zmm10,8 + + vmovdqa64 xmm16,XMMWORD[POLY2] + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vpxorq zmm14,zmm14,zmm15 + vpxorq zmm7,zmm7,zmm10 + vaesenc zmm0,zmm0,zmm31 + vextracti64x4 ymm12,zmm14,1 + vpxorq ymm14,ymm14,ymm12 + vextracti32x4 xmm12,ymm14,1 + vpxorq xmm14,xmm14,xmm12 + vextracti64x4 ymm13,zmm7,1 + vpxorq ymm7,ymm7,ymm13 + vextracti32x4 xmm13,ymm7,1 + vpxorq xmm7,xmm7,xmm13 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vpclmulqdq xmm13,xmm16,xmm7,0x01 + vpslldq xmm13,xmm13,8 + vpxorq xmm13,xmm7,xmm13 + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + vaesenc zmm0,zmm0,zmm31 + vpclmulqdq xmm12,xmm16,xmm13,0x00 + vpsrldq xmm12,xmm12,4 + vpclmulqdq xmm15,xmm16,xmm13,0x10 + vpslldq xmm15,xmm15,4 + + vpternlogq xmm14,xmm15,xmm12,0x96 + vaesenclast 
zmm0,zmm0,zmm30 + vpxorq zmm0,zmm0,zmm17 + vextracti32x4 xmm11,zmm0,2 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10]{k1},zmm0 + vmovdqu8 zmm0{k1}{z},zmm0 + vpshufb zmm17,zmm0,zmm29 + vextracti32x4 xmm7,zmm17,2 + sub r13,16 * (3 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_418 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_418 +$L$_small_initial_partial_block_418: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm17,ymm1,0x01 + vpclmulqdq ymm5,ymm17,ymm1,0x10 + vpclmulqdq ymm0,ymm17,ymm1,0x11 + vpclmulqdq ymm3,ymm17,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_418: + + or r13,r13 + je NEAR $L$_after_reduction_418 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_418: + jmp NEAR $L$_last_blocks_done_412 +$L$_last_num_blocks_is_4_412: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + kmovq k1,[rax*8+r10] + cmp r15d,252 + jae NEAR $L$_16_blocks_overflow_419 + vpaddd zmm0,zmm2,zmm28 + jmp NEAR $L$_16_blocks_ok_419 + +$L$_16_blocks_overflow_419: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpshufb zmm0,zmm0,zmm29 +$L$_16_blocks_ok_419: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1280+rsp] + vmovdqu64 zmm1,ZMMWORD[512+rsp] + vextracti32x4 xmm2,zmm0,3 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[576+rsp] + vmovdqa64 zmm22,ZMMWORD[1344+rsp] + vpxorq zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[640+rsp] + vmovdqa64 zmm8,ZMMWORD[1408+rsp] + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[704+rsp] + vmovdqa64 zmm22,ZMMWORD[1472+rsp] + vaesenc 
zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17{k1}{z},[r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm14,zmm24,zmm12,0x96 + vpternlogq zmm7,zmm25,zmm13,0x96 + vpternlogq zmm10,zmm26,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vpsrldq zmm15,zmm10,8 + vpslldq zmm10,zmm10,8 + + vmovdqa64 xmm16,XMMWORD[POLY2] + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vpxorq zmm14,zmm14,zmm15 + vpxorq zmm7,zmm7,zmm10 + vaesenc zmm0,zmm0,zmm31 + vextracti64x4 ymm12,zmm14,1 + vpxorq ymm14,ymm14,ymm12 + vextracti32x4 xmm12,ymm14,1 + vpxorq xmm14,xmm14,xmm12 + vextracti64x4 ymm13,zmm7,1 + vpxorq ymm7,ymm7,ymm13 + vextracti32x4 xmm13,ymm7,1 + vpxorq xmm7,xmm7,xmm13 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vpclmulqdq xmm13,xmm16,xmm7,0x01 + vpslldq xmm13,xmm13,8 + vpxorq xmm13,xmm7,xmm13 + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + vaesenc zmm0,zmm0,zmm31 + vpclmulqdq xmm12,xmm16,xmm13,0x00 + vpsrldq xmm12,xmm12,4 + vpclmulqdq xmm15,xmm16,xmm13,0x10 + vpslldq xmm15,xmm15,4 + + vpternlogq xmm14,xmm15,xmm12,0x96 + vaesenclast zmm0,zmm0,zmm30 + vpxorq zmm0,zmm0,zmm17 + vextracti32x4 xmm11,zmm0,3 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10]{k1},zmm0 + vmovdqu8 zmm0{k1}{z},zmm0 + vpshufb zmm17,zmm0,zmm29 + vextracti32x4 xmm7,zmm17,3 + sub r13,16 * (4 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_420 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_420 +$L$_small_initial_partial_block_420: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm5 + 
vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_420: + + or r13,r13 + je NEAR $L$_after_reduction_420 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_420: + jmp NEAR $L$_last_blocks_done_412 +$L$_last_num_blocks_is_5_412: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,64 + kmovq k1,[rax*8+r10] + cmp r15d,251 + jae NEAR $L$_16_blocks_overflow_421 + vpaddd zmm0,zmm2,zmm28 + vpaddd xmm3,xmm0,xmm27 + jmp NEAR $L$_16_blocks_ok_421 + +$L$_16_blocks_overflow_421: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb xmm3,xmm3,xmm29 +$L$_16_blocks_ok_421: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1280+rsp] + vmovdqu64 zmm1,ZMMWORD[512+rsp] + vextracti32x4 xmm2,zmm3,0 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[576+rsp] + vmovdqa64 zmm22,ZMMWORD[1344+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[640+rsp] + vmovdqa64 zmm8,ZMMWORD[1408+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[704+rsp] + vmovdqa64 zmm22,ZMMWORD[1472+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 xmm19{k1}{z},[64+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm14,zmm24,zmm12,0x96 + vpternlogq zmm7,zmm25,zmm13,0x96 + vpternlogq zmm10,zmm26,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vpsrldq zmm15,zmm10,8 + vpslldq zmm10,zmm10,8 + + vmovdqa64 xmm16,XMMWORD[POLY2] + vaesenc zmm0,zmm0,zmm30 + vaesenc xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vpxorq zmm14,zmm14,zmm15 + vpxorq 
zmm7,zmm7,zmm10 + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vextracti64x4 ymm12,zmm14,1 + vpxorq ymm14,ymm14,ymm12 + vextracti32x4 xmm12,ymm14,1 + vpxorq xmm14,xmm14,xmm12 + vextracti64x4 ymm13,zmm7,1 + vpxorq ymm7,ymm7,ymm13 + vextracti32x4 xmm13,ymm7,1 + vpxorq xmm7,xmm7,xmm13 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vpclmulqdq xmm13,xmm16,xmm7,0x01 + vpslldq xmm13,xmm13,8 + vpxorq xmm13,xmm7,xmm13 + vaesenc zmm0,zmm0,zmm30 + vaesenc xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vpclmulqdq xmm12,xmm16,xmm13,0x00 + vpsrldq xmm12,xmm12,4 + vpclmulqdq xmm15,xmm16,xmm13,0x10 + vpslldq xmm15,xmm15,4 + + vpternlogq xmm14,xmm15,xmm12,0x96 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast xmm3,xmm3,xmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq xmm3,xmm3,xmm19 + vextracti32x4 xmm11,zmm3,0 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 XMMWORD[64+r11*1+r10]{k1},xmm3 + vmovdqu8 zmm3{k1}{z},zmm3 + vpshufb zmm17,zmm0,zmm29 + vpshufb xmm19,xmm3,xmm29 + vextracti32x4 xmm7,zmm19,0 + sub r13,16 * (5 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_422 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm19,xmm1,0x01 + vpclmulqdq xmm5,xmm19,xmm1,0x10 + vpclmulqdq xmm0,xmm19,xmm1,0x11 + vpclmulqdq xmm3,xmm19,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm31 + vpxorq zmm0,zmm0,zmm8 + vpxorq zmm3,zmm3,zmm22 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_422 +$L$_small_initial_partial_block_422: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_422: + + or r13,r13 + je NEAR $L$_after_reduction_422 + vpxorq xmm14,xmm14,xmm7 
+$L$_after_reduction_422: + jmp NEAR $L$_last_blocks_done_412 +$L$_last_num_blocks_is_6_412: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,64 + kmovq k1,[rax*8+r10] + cmp r15d,250 + jae NEAR $L$_16_blocks_overflow_423 + vpaddd zmm0,zmm2,zmm28 + vpaddd ymm3,ymm0,ymm27 + jmp NEAR $L$_16_blocks_ok_423 + +$L$_16_blocks_overflow_423: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb ymm3,ymm3,ymm29 +$L$_16_blocks_ok_423: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1280+rsp] + vmovdqu64 zmm1,ZMMWORD[512+rsp] + vextracti32x4 xmm2,zmm3,1 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[576+rsp] + vmovdqa64 zmm22,ZMMWORD[1344+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[640+rsp] + vmovdqa64 zmm8,ZMMWORD[1408+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[704+rsp] + vmovdqa64 zmm22,ZMMWORD[1472+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 ymm19{k1}{z},[64+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm14,zmm24,zmm12,0x96 + vpternlogq zmm7,zmm25,zmm13,0x96 + vpternlogq zmm10,zmm26,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vpsrldq zmm15,zmm10,8 + vpslldq zmm10,zmm10,8 + + vmovdqa64 xmm16,XMMWORD[POLY2] + vaesenc zmm0,zmm0,zmm30 + vaesenc ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vpxorq zmm14,zmm14,zmm15 + vpxorq zmm7,zmm7,zmm10 + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vextracti64x4 ymm12,zmm14,1 + vpxorq ymm14,ymm14,ymm12 + vextracti32x4 xmm12,ymm14,1 + vpxorq xmm14,xmm14,xmm12 + vextracti64x4 ymm13,zmm7,1 + vpxorq ymm7,ymm7,ymm13 + vextracti32x4 xmm13,ymm7,1 + vpxorq xmm7,xmm7,xmm13 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vpclmulqdq xmm13,xmm16,xmm7,0x01 + vpslldq xmm13,xmm13,8 + vpxorq xmm13,xmm7,xmm13 + vaesenc zmm0,zmm0,zmm30 + vaesenc ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc ymm3,ymm3,ymm30 + vbroadcastf64x2 
zmm30,ZMMWORD[224+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vpclmulqdq xmm12,xmm16,xmm13,0x00 + vpsrldq xmm12,xmm12,4 + vpclmulqdq xmm15,xmm16,xmm13,0x10 + vpslldq xmm15,xmm15,4 + + vpternlogq xmm14,xmm15,xmm12,0x96 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast ymm3,ymm3,ymm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm11,zmm3,1 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 YMMWORD[64+r11*1+r10]{k1},ymm3 + vmovdqu8 zmm3{k1}{z},zmm3 + vpshufb zmm17,zmm0,zmm29 + vpshufb ymm19,ymm3,ymm29 + vextracti32x4 xmm7,zmm19,1 + sub r13,16 * (6 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_424 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm19,ymm1,0x01 + vpclmulqdq ymm5,ymm19,ymm1,0x10 + vpclmulqdq ymm0,ymm19,ymm1,0x11 + vpclmulqdq ymm3,ymm19,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm31 + vpxorq zmm0,zmm0,zmm8 + vpxorq zmm3,zmm3,zmm22 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_424 +$L$_small_initial_partial_block_424: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm19,xmm1,0x01 + vpclmulqdq xmm5,xmm19,xmm1,0x10 + vpclmulqdq xmm0,xmm19,xmm1,0x11 + vpclmulqdq xmm3,xmm19,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm31 + vpxorq zmm0,zmm0,zmm8 + vpxorq zmm3,zmm3,zmm22 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_424: + + or r13,r13 + je NEAR $L$_after_reduction_424 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_424: + jmp NEAR $L$_last_blocks_done_412 +$L$_last_num_blocks_is_7_412: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,64 + kmovq k1,[rax*8+r10] + cmp r15d,249 + jae NEAR $L$_16_blocks_overflow_425 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + jmp NEAR $L$_16_blocks_ok_425 + +$L$_16_blocks_overflow_425: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + 
vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 +$L$_16_blocks_ok_425: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1280+rsp] + vmovdqu64 zmm1,ZMMWORD[512+rsp] + vextracti32x4 xmm2,zmm3,2 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[576+rsp] + vmovdqa64 zmm22,ZMMWORD[1344+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[640+rsp] + vmovdqa64 zmm8,ZMMWORD[1408+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[704+rsp] + vmovdqa64 zmm22,ZMMWORD[1472+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19{k1}{z},[64+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm14,zmm24,zmm12,0x96 + vpternlogq zmm7,zmm25,zmm13,0x96 + vpternlogq zmm10,zmm26,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vpsrldq zmm15,zmm10,8 + vpslldq zmm10,zmm10,8 + + vmovdqa64 xmm16,XMMWORD[POLY2] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vpxorq zmm14,zmm14,zmm15 + vpxorq zmm7,zmm7,zmm10 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vextracti64x4 ymm12,zmm14,1 + vpxorq ymm14,ymm14,ymm12 + vextracti32x4 xmm12,ymm14,1 + vpxorq xmm14,xmm14,xmm12 + vextracti64x4 ymm13,zmm7,1 + vpxorq ymm7,ymm7,ymm13 + vextracti32x4 xmm13,ymm7,1 + vpxorq xmm7,xmm7,xmm13 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vpclmulqdq xmm13,xmm16,xmm7,0x01 + vpslldq xmm13,xmm13,8 + vpxorq xmm13,xmm7,xmm13 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vpclmulqdq xmm12,xmm16,xmm13,0x00 + vpsrldq xmm12,xmm12,4 + vpclmulqdq xmm15,xmm16,xmm13,0x10 + vpslldq xmm15,xmm15,4 + + vpternlogq xmm14,xmm15,xmm12,0x96 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti32x4 xmm11,zmm3,2 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 
+ vmovdqu8 ZMMWORD[64+r11*1+r10]{k1},zmm3 + vmovdqu8 zmm3{k1}{z},zmm3 + vpshufb zmm17,zmm0,zmm29 + vpshufb zmm19,zmm3,zmm29 + vextracti32x4 xmm7,zmm19,2 + sub r13,16 * (7 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_426 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm19,zmm1,0x01 + vpclmulqdq zmm5,zmm19,zmm1,0x10 + vpclmulqdq zmm0,zmm19,zmm1,0x11 + vpclmulqdq zmm3,zmm19,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm31 + vpxorq zmm0,zmm0,zmm8 + vpxorq zmm3,zmm3,zmm22 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_426 +$L$_small_initial_partial_block_426: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm19,ymm1,0x01 + vpclmulqdq ymm5,ymm19,ymm1,0x10 + vpclmulqdq ymm0,ymm19,ymm1,0x11 + vpclmulqdq ymm3,ymm19,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm31 + vpxorq zmm0,zmm0,zmm8 + vpxorq zmm3,zmm3,zmm22 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_426: + + or r13,r13 + je NEAR $L$_after_reduction_426 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_426: + jmp NEAR $L$_last_blocks_done_412 +$L$_last_num_blocks_is_8_412: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,64 + kmovq k1,[rax*8+r10] + cmp r15d,248 + jae NEAR $L$_16_blocks_overflow_427 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + jmp NEAR $L$_16_blocks_ok_427 + +$L$_16_blocks_overflow_427: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 +$L$_16_blocks_ok_427: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1280+rsp] + vmovdqu64 zmm1,ZMMWORD[512+rsp] + vextracti32x4 xmm2,zmm3,3 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[576+rsp] + 
vmovdqa64 zmm22,ZMMWORD[1344+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[640+rsp] + vmovdqa64 zmm8,ZMMWORD[1408+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[704+rsp] + vmovdqa64 zmm22,ZMMWORD[1472+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19{k1}{z},[64+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm14,zmm24,zmm12,0x96 + vpternlogq zmm7,zmm25,zmm13,0x96 + vpternlogq zmm10,zmm26,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vpsrldq zmm15,zmm10,8 + vpslldq zmm10,zmm10,8 + + vmovdqa64 xmm16,XMMWORD[POLY2] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vpxorq zmm14,zmm14,zmm15 + vpxorq zmm7,zmm7,zmm10 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vextracti64x4 ymm12,zmm14,1 + vpxorq ymm14,ymm14,ymm12 + vextracti32x4 xmm12,ymm14,1 + vpxorq xmm14,xmm14,xmm12 + vextracti64x4 ymm13,zmm7,1 + vpxorq ymm7,ymm7,ymm13 + vextracti32x4 xmm13,ymm7,1 + vpxorq xmm7,xmm7,xmm13 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vpclmulqdq xmm13,xmm16,xmm7,0x01 + vpslldq xmm13,xmm13,8 + vpxorq xmm13,xmm7,xmm13 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vpclmulqdq xmm12,xmm16,xmm13,0x00 + vpsrldq xmm12,xmm12,4 + vpclmulqdq xmm15,xmm16,xmm13,0x10 + vpslldq xmm15,xmm15,4 + + vpternlogq xmm14,xmm15,xmm12,0x96 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti32x4 xmm11,zmm3,3 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10]{k1},zmm3 + vmovdqu8 zmm3{k1}{z},zmm3 + vpshufb zmm17,zmm0,zmm29 + vpshufb zmm19,zmm3,zmm29 + vextracti32x4 xmm7,zmm19,3 + sub r13,16 * (8 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_428 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[224+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq 
zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_428 +$L$_small_initial_partial_block_428: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm19,zmm1,0x01 + vpclmulqdq zmm5,zmm19,zmm1,0x10 + vpclmulqdq zmm0,zmm19,zmm1,0x11 + vpclmulqdq zmm3,zmm19,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm31 + vpxorq zmm0,zmm0,zmm8 + vpxorq zmm3,zmm3,zmm22 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_428: + + or r13,r13 + je NEAR $L$_after_reduction_428 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_428: + jmp NEAR $L$_last_blocks_done_412 +$L$_last_num_blocks_is_9_412: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,128 + kmovq k1,[rax*8+r10] + cmp r15d,247 + jae NEAR $L$_16_blocks_overflow_429 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd xmm4,xmm3,xmm27 + jmp NEAR $L$_16_blocks_ok_429 + +$L$_16_blocks_overflow_429: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb xmm4,xmm4,xmm29 +$L$_16_blocks_ok_429: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1280+rsp] + vmovdqu64 zmm1,ZMMWORD[512+rsp] + vextracti32x4 xmm2,zmm4,0 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[576+rsp] + vmovdqa64 zmm22,ZMMWORD[1344+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 
zmm1,ZMMWORD[640+rsp] + vmovdqa64 zmm8,ZMMWORD[1408+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[704+rsp] + vmovdqa64 zmm22,ZMMWORD[1472+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 xmm20{k1}{z},[128+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm14,zmm24,zmm12,0x96 + vpternlogq zmm7,zmm25,zmm13,0x96 + vpternlogq zmm10,zmm26,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vpsrldq zmm15,zmm10,8 + vpslldq zmm10,zmm10,8 + + vmovdqa64 xmm16,XMMWORD[POLY2] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vpxorq zmm14,zmm14,zmm15 + vpxorq zmm7,zmm7,zmm10 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vextracti64x4 ymm12,zmm14,1 + vpxorq ymm14,ymm14,ymm12 + vextracti32x4 xmm12,ymm14,1 + vpxorq xmm14,xmm14,xmm12 + vextracti64x4 ymm13,zmm7,1 + vpxorq ymm7,ymm7,ymm13 + vextracti32x4 xmm13,ymm7,1 + vpxorq xmm7,xmm7,xmm13 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vpclmulqdq xmm13,xmm16,xmm7,0x01 + vpslldq xmm13,xmm13,8 + vpxorq xmm13,xmm7,xmm13 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vpclmulqdq xmm12,xmm16,xmm13,0x00 + vpsrldq xmm12,xmm12,4 + vpclmulqdq xmm15,xmm16,xmm13,0x10 + vpslldq xmm15,xmm15,4 + + vpternlogq xmm14,xmm15,xmm12,0x96 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast xmm4,xmm4,xmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq xmm4,xmm4,xmm20 + vextracti32x4 xmm11,zmm4,0 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 XMMWORD[128+r11*1+r10]{k1},xmm4 + vmovdqu8 zmm4{k1}{z},zmm4 + vpshufb zmm17,zmm0,zmm29 + vpshufb zmm19,zmm3,zmm29 + vpshufb xmm20,xmm4,xmm29 + vextracti32x4 xmm7,zmm20,0 + sub r13,16 
* (9 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_430 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[208+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm20,xmm1,0x01 + vpclmulqdq xmm5,xmm20,xmm1,0x10 + vpclmulqdq xmm0,xmm20,xmm1,0x11 + vpclmulqdq xmm3,xmm20,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm31 + vpxorq zmm0,zmm0,zmm8 + vpxorq zmm3,zmm3,zmm22 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_430 +$L$_small_initial_partial_block_430: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[224+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_430: + + or r13,r13 + je NEAR $L$_after_reduction_430 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_430: + jmp NEAR $L$_last_blocks_done_412 +$L$_last_num_blocks_is_10_412: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,128 + kmovq k1,[rax*8+r10] + cmp r15d,246 + jae NEAR $L$_16_blocks_overflow_431 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd ymm4,ymm3,ymm27 + jmp NEAR $L$_16_blocks_ok_431 + +$L$_16_blocks_overflow_431: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb ymm4,ymm4,ymm29 +$L$_16_blocks_ok_431: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1280+rsp] + vmovdqu64 zmm1,ZMMWORD[512+rsp] + 
vextracti32x4 xmm2,zmm4,1 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[576+rsp] + vmovdqa64 zmm22,ZMMWORD[1344+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[640+rsp] + vmovdqa64 zmm8,ZMMWORD[1408+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[704+rsp] + vmovdqa64 zmm22,ZMMWORD[1472+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 ymm20{k1}{z},[128+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm14,zmm24,zmm12,0x96 + vpternlogq zmm7,zmm25,zmm13,0x96 + vpternlogq zmm10,zmm26,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vpsrldq zmm15,zmm10,8 + vpslldq zmm10,zmm10,8 + + vmovdqa64 xmm16,XMMWORD[POLY2] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vpxorq zmm14,zmm14,zmm15 + vpxorq zmm7,zmm7,zmm10 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vextracti64x4 ymm12,zmm14,1 + vpxorq ymm14,ymm14,ymm12 + vextracti32x4 xmm12,ymm14,1 + vpxorq xmm14,xmm14,xmm12 + vextracti64x4 ymm13,zmm7,1 + vpxorq ymm7,ymm7,ymm13 + vextracti32x4 xmm13,ymm7,1 + vpxorq xmm7,xmm7,xmm13 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vpclmulqdq xmm13,xmm16,xmm7,0x01 + vpslldq xmm13,xmm13,8 + vpxorq xmm13,xmm7,xmm13 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vpclmulqdq xmm12,xmm16,xmm13,0x00 + vpsrldq xmm12,xmm12,4 + vpclmulqdq xmm15,xmm16,xmm13,0x10 + vpslldq xmm15,xmm15,4 + + vpternlogq xmm14,xmm15,xmm12,0x96 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast 
zmm3,zmm3,zmm30 + vaesenclast ymm4,ymm4,ymm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq ymm4,ymm4,ymm20 + vextracti32x4 xmm11,zmm4,1 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 YMMWORD[128+r11*1+r10]{k1},ymm4 + vmovdqu8 zmm4{k1}{z},zmm4 + vpshufb zmm17,zmm0,zmm29 + vpshufb zmm19,zmm3,zmm29 + vpshufb ymm20,ymm4,ymm29 + vextracti32x4 xmm7,zmm20,1 + sub r13,16 * (10 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_432 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[192+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm20,ymm1,0x01 + vpclmulqdq ymm5,ymm20,ymm1,0x10 + vpclmulqdq ymm0,ymm20,ymm1,0x11 + vpclmulqdq ymm3,ymm20,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm31 + vpxorq zmm0,zmm0,zmm8 + vpxorq zmm3,zmm3,zmm22 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_432 +$L$_small_initial_partial_block_432: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[208+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm20,xmm1,0x01 + vpclmulqdq xmm5,xmm20,xmm1,0x10 + vpclmulqdq xmm0,xmm20,xmm1,0x11 + vpclmulqdq xmm3,xmm20,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm31 + vpxorq zmm0,zmm0,zmm8 + vpxorq zmm3,zmm3,zmm22 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_432: + + or r13,r13 + je NEAR $L$_after_reduction_432 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_432: + jmp 
NEAR $L$_last_blocks_done_412 +$L$_last_num_blocks_is_11_412: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,128 + kmovq k1,[rax*8+r10] + cmp r15d,245 + jae NEAR $L$_16_blocks_overflow_433 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + jmp NEAR $L$_16_blocks_ok_433 + +$L$_16_blocks_overflow_433: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 +$L$_16_blocks_ok_433: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1280+rsp] + vmovdqu64 zmm1,ZMMWORD[512+rsp] + vextracti32x4 xmm2,zmm4,2 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[576+rsp] + vmovdqa64 zmm22,ZMMWORD[1344+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[640+rsp] + vmovdqa64 zmm8,ZMMWORD[1408+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[704+rsp] + vmovdqa64 zmm22,ZMMWORD[1472+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20{k1}{z},[128+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm14,zmm24,zmm12,0x96 + vpternlogq zmm7,zmm25,zmm13,0x96 + vpternlogq zmm10,zmm26,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vpsrldq zmm15,zmm10,8 + vpslldq zmm10,zmm10,8 + + vmovdqa64 xmm16,XMMWORD[POLY2] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vpxorq zmm14,zmm14,zmm15 + vpxorq zmm7,zmm7,zmm10 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vextracti64x4 ymm12,zmm14,1 + vpxorq ymm14,ymm14,ymm12 + vextracti32x4 xmm12,ymm14,1 + vpxorq xmm14,xmm14,xmm12 + vextracti64x4 ymm13,zmm7,1 + vpxorq ymm7,ymm7,ymm13 + vextracti32x4 xmm13,ymm7,1 + vpxorq xmm7,xmm7,xmm13 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vpclmulqdq 
xmm13,xmm16,xmm7,0x01 + vpslldq xmm13,xmm13,8 + vpxorq xmm13,xmm7,xmm13 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vpclmulqdq xmm12,xmm16,xmm13,0x00 + vpsrldq xmm12,xmm12,4 + vpclmulqdq xmm15,xmm16,xmm13,0x10 + vpslldq xmm15,xmm15,4 + + vpternlogq xmm14,xmm15,xmm12,0x96 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vextracti32x4 xmm11,zmm4,2 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10]{k1},zmm4 + vmovdqu8 zmm4{k1}{z},zmm4 + vpshufb zmm17,zmm0,zmm29 + vpshufb zmm19,zmm3,zmm29 + vpshufb zmm20,zmm4,zmm29 + vextracti32x4 xmm7,zmm20,2 + sub r13,16 * (11 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_434 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[176+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm20,zmm1,0x01 + vpclmulqdq zmm5,zmm20,zmm1,0x10 + vpclmulqdq zmm0,zmm20,zmm1,0x11 + vpclmulqdq zmm3,zmm20,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm31 + vpxorq zmm0,zmm0,zmm8 + vpxorq zmm3,zmm3,zmm22 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_434 +$L$_small_initial_partial_block_434: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[192+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm20,ymm1,0x01 + vpclmulqdq ymm5,ymm20,ymm1,0x10 + vpclmulqdq ymm0,ymm20,ymm1,0x11 + vpclmulqdq ymm3,ymm20,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm31 + vpxorq zmm0,zmm0,zmm8 + vpxorq 
zmm3,zmm3,zmm22 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_434: + + or r13,r13 + je NEAR $L$_after_reduction_434 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_434: + jmp NEAR $L$_last_blocks_done_412 +$L$_last_num_blocks_is_12_412: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,128 + kmovq k1,[rax*8+r10] + cmp r15d,244 + jae NEAR $L$_16_blocks_overflow_435 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + jmp NEAR $L$_16_blocks_ok_435 + +$L$_16_blocks_overflow_435: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 +$L$_16_blocks_ok_435: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1280+rsp] + vmovdqu64 zmm1,ZMMWORD[512+rsp] + vextracti32x4 xmm2,zmm4,3 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[576+rsp] + vmovdqa64 zmm22,ZMMWORD[1344+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[640+rsp] + vmovdqa64 zmm8,ZMMWORD[1408+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[704+rsp] + vmovdqa64 zmm22,ZMMWORD[1472+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20{k1}{z},[128+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm14,zmm24,zmm12,0x96 + vpternlogq zmm7,zmm25,zmm13,0x96 
+ vpternlogq zmm10,zmm26,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vpsrldq zmm15,zmm10,8 + vpslldq zmm10,zmm10,8 + + vmovdqa64 xmm16,XMMWORD[POLY2] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vpxorq zmm14,zmm14,zmm15 + vpxorq zmm7,zmm7,zmm10 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vextracti64x4 ymm12,zmm14,1 + vpxorq ymm14,ymm14,ymm12 + vextracti32x4 xmm12,ymm14,1 + vpxorq xmm14,xmm14,xmm12 + vextracti64x4 ymm13,zmm7,1 + vpxorq ymm7,ymm7,ymm13 + vextracti32x4 xmm13,ymm7,1 + vpxorq xmm7,xmm7,xmm13 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vpclmulqdq xmm13,xmm16,xmm7,0x01 + vpslldq xmm13,xmm13,8 + vpxorq xmm13,xmm7,xmm13 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vpclmulqdq xmm12,xmm16,xmm13,0x00 + vpsrldq xmm12,xmm12,4 + vpclmulqdq xmm15,xmm16,xmm13,0x10 + vpslldq xmm15,xmm15,4 + + vpternlogq xmm14,xmm15,xmm12,0x96 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vextracti32x4 xmm11,zmm4,3 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10]{k1},zmm4 + vmovdqu8 zmm4{k1}{z},zmm4 + vpshufb zmm17,zmm0,zmm29 + vpshufb zmm19,zmm3,zmm29 + vpshufb zmm20,zmm4,zmm29 + vextracti32x4 xmm7,zmm20,3 + sub r13,16 * (12 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_436 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[160+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[224+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_436 +$L$_small_initial_partial_block_436: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[176+rdx] + vpclmulqdq 
zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm20,zmm1,0x01 + vpclmulqdq zmm5,zmm20,zmm1,0x10 + vpclmulqdq zmm0,zmm20,zmm1,0x11 + vpclmulqdq zmm3,zmm20,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm31 + vpxorq zmm0,zmm0,zmm8 + vpxorq zmm3,zmm3,zmm22 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_436: + + or r13,r13 + je NEAR $L$_after_reduction_436 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_436: + jmp NEAR $L$_last_blocks_done_412 +$L$_last_num_blocks_is_13_412: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,192 + kmovq k1,[rax*8+r10] + cmp r15d,243 + jae NEAR $L$_16_blocks_overflow_437 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + vpaddd xmm5,xmm4,xmm27 + jmp NEAR $L$_16_blocks_ok_437 + +$L$_16_blocks_overflow_437: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpaddd zmm5,zmm4,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb xmm5,xmm5,xmm29 +$L$_16_blocks_ok_437: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1280+rsp] + vmovdqu64 zmm1,ZMMWORD[512+rsp] + vextracti32x4 xmm2,zmm5,0 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[576+rsp] + vmovdqa64 zmm22,ZMMWORD[1344+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vpxorq xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[640+rsp] + vmovdqa64 zmm8,ZMMWORD[1408+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[704+rsp] + vmovdqa64 zmm22,ZMMWORD[1472+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vbroadcastf64x2 
zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20,ZMMWORD[128+r11*1+r9] + vmovdqu8 xmm21{k1}{z},[192+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm14,zmm24,zmm12,0x96 + vpternlogq zmm7,zmm25,zmm13,0x96 + vpternlogq zmm10,zmm26,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vpsrldq zmm15,zmm10,8 + vpslldq zmm10,zmm10,8 + + vmovdqa64 xmm16,XMMWORD[POLY2] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vpxorq zmm14,zmm14,zmm15 + vpxorq zmm7,zmm7,zmm10 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vextracti64x4 ymm12,zmm14,1 + vpxorq ymm14,ymm14,ymm12 + vextracti32x4 xmm12,ymm14,1 + vpxorq xmm14,xmm14,xmm12 + vextracti64x4 ymm13,zmm7,1 + vpxorq ymm7,ymm7,ymm13 + vextracti32x4 xmm13,ymm7,1 + vpxorq xmm7,xmm7,xmm13 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vpclmulqdq xmm13,xmm16,xmm7,0x01 + vpslldq xmm13,xmm13,8 + vpxorq xmm13,xmm7,xmm13 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vpclmulqdq xmm12,xmm16,xmm13,0x00 + vpsrldq xmm12,xmm12,4 + vpclmulqdq xmm15,xmm16,xmm13,0x10 + vpslldq xmm15,xmm15,4 + + vpternlogq xmm14,xmm15,xmm12,0x96 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vaesenclast xmm5,xmm5,xmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vpxorq xmm5,xmm5,xmm21 + vextracti32x4 xmm11,zmm5,0 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm4 + vmovdqu8 XMMWORD[192+r11*1+r10]{k1},xmm5 + vmovdqu8 zmm5{k1}{z},zmm5 + vpshufb zmm17,zmm0,zmm29 + vpshufb zmm19,zmm3,zmm29 + vpshufb zmm20,zmm4,zmm29 + vpshufb xmm21,xmm5,xmm29 + vextracti32x4 xmm7,zmm21,0 + sub r13,16 * (13 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_438 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[144+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 
zmm1,ZMMWORD[208+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm21,xmm1,0x01 + vpclmulqdq xmm5,xmm21,xmm1,0x10 + vpclmulqdq xmm0,xmm21,xmm1,0x11 + vpclmulqdq xmm3,xmm21,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm31 + vpxorq zmm0,zmm0,zmm8 + vpxorq zmm3,zmm3,zmm22 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_438 +$L$_small_initial_partial_block_438: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[160+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[224+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_438: + + or r13,r13 + je NEAR $L$_after_reduction_438 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_438: + jmp NEAR $L$_last_blocks_done_412 +$L$_last_num_blocks_is_14_412: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,192 + kmovq k1,[rax*8+r10] + cmp r15d,242 + jae NEAR $L$_16_blocks_overflow_439 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + vpaddd ymm5,ymm4,ymm27 + jmp NEAR $L$_16_blocks_ok_439 + +$L$_16_blocks_overflow_439: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpaddd zmm5,zmm4,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb 
zmm4,zmm4,zmm29 + vpshufb ymm5,ymm5,ymm29 +$L$_16_blocks_ok_439: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1280+rsp] + vmovdqu64 zmm1,ZMMWORD[512+rsp] + vextracti32x4 xmm2,zmm5,1 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[576+rsp] + vmovdqa64 zmm22,ZMMWORD[1344+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vpxorq ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[640+rsp] + vmovdqa64 zmm8,ZMMWORD[1408+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[704+rsp] + vmovdqa64 zmm22,ZMMWORD[1472+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20,ZMMWORD[128+r11*1+r9] + vmovdqu8 ymm21{k1}{z},[192+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm14,zmm24,zmm12,0x96 + vpternlogq zmm7,zmm25,zmm13,0x96 + vpternlogq zmm10,zmm26,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vpsrldq zmm15,zmm10,8 + vpslldq zmm10,zmm10,8 + + vmovdqa64 xmm16,XMMWORD[POLY2] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vpxorq zmm14,zmm14,zmm15 + vpxorq zmm7,zmm7,zmm10 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vextracti64x4 ymm12,zmm14,1 + vpxorq ymm14,ymm14,ymm12 + vextracti32x4 xmm12,ymm14,1 + vpxorq xmm14,xmm14,xmm12 + vextracti64x4 ymm13,zmm7,1 + vpxorq ymm7,ymm7,ymm13 + vextracti32x4 xmm13,ymm7,1 + vpxorq xmm7,xmm7,xmm13 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vpclmulqdq xmm13,xmm16,xmm7,0x01 + vpslldq xmm13,xmm13,8 + vpxorq xmm13,xmm7,xmm13 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc 
zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vpclmulqdq xmm12,xmm16,xmm13,0x00 + vpsrldq xmm12,xmm12,4 + vpclmulqdq xmm15,xmm16,xmm13,0x10 + vpslldq xmm15,xmm15,4 + + vpternlogq xmm14,xmm15,xmm12,0x96 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vaesenclast ymm5,ymm5,ymm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vpxorq ymm5,ymm5,ymm21 + vextracti32x4 xmm11,zmm5,1 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm4 + vmovdqu8 YMMWORD[192+r11*1+r10]{k1},ymm5 + vmovdqu8 zmm5{k1}{z},zmm5 + vpshufb zmm17,zmm0,zmm29 + vpshufb zmm19,zmm3,zmm29 + vpshufb zmm20,zmm4,zmm29 + vpshufb ymm21,ymm5,ymm29 + vextracti32x4 xmm7,zmm21,1 + sub r13,16 * (14 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_440 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[128+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[192+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm21,ymm1,0x01 + vpclmulqdq ymm5,ymm21,ymm1,0x10 + vpclmulqdq ymm0,ymm21,ymm1,0x11 + vpclmulqdq ymm3,ymm21,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm31 + vpxorq zmm0,zmm0,zmm8 + vpxorq zmm3,zmm3,zmm22 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_440 +$L$_small_initial_partial_block_440: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[144+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[208+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq 
zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm21,xmm1,0x01 + vpclmulqdq xmm5,xmm21,xmm1,0x10 + vpclmulqdq xmm0,xmm21,xmm1,0x11 + vpclmulqdq xmm3,xmm21,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm31 + vpxorq zmm0,zmm0,zmm8 + vpxorq zmm3,zmm3,zmm22 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_440: + + or r13,r13 + je NEAR $L$_after_reduction_440 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_440: + jmp NEAR $L$_last_blocks_done_412 +$L$_last_num_blocks_is_15_412: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,192 + kmovq k1,[rax*8+r10] + cmp r15d,241 + jae NEAR $L$_16_blocks_overflow_441 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + vpaddd zmm5,zmm4,zmm27 + jmp NEAR $L$_16_blocks_ok_441 + +$L$_16_blocks_overflow_441: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpaddd zmm5,zmm4,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb zmm5,zmm5,zmm29 +$L$_16_blocks_ok_441: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1280+rsp] + vmovdqu64 zmm1,ZMMWORD[512+rsp] + vextracti32x4 xmm2,zmm5,2 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[576+rsp] + vmovdqa64 zmm22,ZMMWORD[1344+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[640+rsp] + vmovdqa64 zmm8,ZMMWORD[1408+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[704+rsp] + vmovdqa64 zmm22,ZMMWORD[1472+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 
zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20,ZMMWORD[128+r11*1+r9] + vmovdqu8 zmm21{k1}{z},[192+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm14,zmm24,zmm12,0x96 + vpternlogq zmm7,zmm25,zmm13,0x96 + vpternlogq zmm10,zmm26,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vpsrldq zmm15,zmm10,8 + vpslldq zmm10,zmm10,8 + + vmovdqa64 xmm16,XMMWORD[POLY2] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vpxorq zmm14,zmm14,zmm15 + vpxorq zmm7,zmm7,zmm10 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vextracti64x4 ymm12,zmm14,1 + vpxorq ymm14,ymm14,ymm12 + vextracti32x4 xmm12,ymm14,1 + vpxorq xmm14,xmm14,xmm12 + vextracti64x4 ymm13,zmm7,1 + vpxorq ymm7,ymm7,ymm13 + vextracti32x4 xmm13,ymm7,1 + vpxorq xmm7,xmm7,xmm13 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vpclmulqdq xmm13,xmm16,xmm7,0x01 + vpslldq xmm13,xmm13,8 + vpxorq xmm13,xmm7,xmm13 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vpclmulqdq xmm12,xmm16,xmm13,0x00 + vpsrldq xmm12,xmm12,4 + vpclmulqdq xmm15,xmm16,xmm13,0x10 + vpslldq xmm15,xmm15,4 + + vpternlogq xmm14,xmm15,xmm12,0x96 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vaesenclast zmm5,zmm5,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vpxorq zmm5,zmm5,zmm21 + vextracti32x4 xmm11,zmm5,2 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm4 + vmovdqu8 ZMMWORD[192+r11*1+r10]{k1},zmm5 + vmovdqu8 zmm5{k1}{z},zmm5 + vpshufb zmm17,zmm0,zmm29 + vpshufb zmm19,zmm3,zmm29 + vpshufb zmm20,zmm4,zmm29 + vpshufb zmm21,zmm5,zmm29 + vextracti32x4 xmm7,zmm21,2 + sub r13,16 * (15 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_442 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[112+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[176+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq 
zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm21,zmm1,0x01 + vpclmulqdq zmm5,zmm21,zmm1,0x10 + vpclmulqdq zmm0,zmm21,zmm1,0x11 + vpclmulqdq zmm3,zmm21,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm31 + vpxorq zmm0,zmm0,zmm8 + vpxorq zmm3,zmm3,zmm22 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_442 +$L$_small_initial_partial_block_442: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[128+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[192+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm21,ymm1,0x01 + vpclmulqdq ymm5,ymm21,ymm1,0x10 + vpclmulqdq ymm0,ymm21,ymm1,0x11 + vpclmulqdq ymm3,ymm21,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm31 + vpxorq zmm0,zmm0,zmm8 + vpxorq zmm3,zmm3,zmm22 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_442: + + or r13,r13 + je NEAR $L$_after_reduction_442 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_442: + jmp NEAR $L$_last_blocks_done_412 +$L$_last_num_blocks_is_16_412: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,192 + kmovq k1,[rax*8+r10] + cmp r15d,240 + jae NEAR $L$_16_blocks_overflow_443 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + vpaddd zmm5,zmm4,zmm27 + jmp NEAR $L$_16_blocks_ok_443 + +$L$_16_blocks_overflow_443: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpaddd zmm5,zmm4,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb 
zmm4,zmm4,zmm29 + vpshufb zmm5,zmm5,zmm29 +$L$_16_blocks_ok_443: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1280+rsp] + vmovdqu64 zmm1,ZMMWORD[512+rsp] + vextracti32x4 xmm2,zmm5,3 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[576+rsp] + vmovdqa64 zmm22,ZMMWORD[1344+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[640+rsp] + vmovdqa64 zmm8,ZMMWORD[1408+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[704+rsp] + vmovdqa64 zmm22,ZMMWORD[1472+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20,ZMMWORD[128+r11*1+r9] + vmovdqu8 zmm21{k1}{z},[192+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm14,zmm24,zmm12,0x96 + vpternlogq zmm7,zmm25,zmm13,0x96 + vpternlogq zmm10,zmm26,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vpsrldq zmm15,zmm10,8 + vpslldq zmm10,zmm10,8 + + vmovdqa64 xmm16,XMMWORD[POLY2] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vpxorq zmm14,zmm14,zmm15 + vpxorq zmm7,zmm7,zmm10 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vextracti64x4 ymm12,zmm14,1 + vpxorq ymm14,ymm14,ymm12 + vextracti32x4 xmm12,ymm14,1 + vpxorq xmm14,xmm14,xmm12 + vextracti64x4 ymm13,zmm7,1 + vpxorq ymm7,ymm7,ymm13 + vextracti32x4 xmm13,ymm7,1 + vpxorq xmm7,xmm7,xmm13 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vpclmulqdq xmm13,xmm16,xmm7,0x01 + vpslldq xmm13,xmm13,8 + vpxorq xmm13,xmm7,xmm13 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc 
zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vpclmulqdq xmm12,xmm16,xmm13,0x00 + vpsrldq xmm12,xmm12,4 + vpclmulqdq xmm15,xmm16,xmm13,0x10 + vpslldq xmm15,xmm15,4 + + vpternlogq xmm14,xmm15,xmm12,0x96 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vaesenclast zmm5,zmm5,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vpxorq zmm5,zmm5,zmm21 + vextracti32x4 xmm11,zmm5,3 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm4 + vmovdqu8 ZMMWORD[192+r11*1+r10]{k1},zmm5 + vmovdqu8 zmm5{k1}{z},zmm5 + vpshufb zmm17,zmm0,zmm29 + vpshufb zmm19,zmm3,zmm29 + vpshufb zmm20,zmm4,zmm29 + vpshufb zmm21,zmm5,zmm29 + vextracti32x4 xmm7,zmm21,3 + sub r13,16 * (16 - 1) +$L$_small_initial_partial_block_444: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[112+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[176+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm21,zmm1,0x01 + vpclmulqdq zmm5,zmm21,zmm1,0x10 + vpclmulqdq zmm0,zmm21,zmm1,0x11 + vpclmulqdq zmm3,zmm21,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm31 + vpxorq zmm0,zmm0,zmm8 + vpxorq zmm3,zmm3,zmm22 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_444: + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_444: + jmp NEAR $L$_last_blocks_done_412 +$L$_last_num_blocks_is_0_412: + vmovdqa64 zmm13,ZMMWORD[1280+rsp] + vmovdqu64 zmm12,ZMMWORD[512+rsp] + vpclmulqdq zmm0,zmm13,zmm12,0x11 + vpclmulqdq zmm3,zmm13,zmm12,0x00 + vpclmulqdq zmm4,zmm13,zmm12,0x01 + vpclmulqdq zmm5,zmm13,zmm12,0x10 + vmovdqa64 zmm13,ZMMWORD[1344+rsp] + vmovdqu64 zmm12,ZMMWORD[576+rsp] + vpclmulqdq zmm6,zmm13,zmm12,0x11 + vpclmulqdq zmm7,zmm13,zmm12,0x00 + vpclmulqdq zmm10,zmm13,zmm12,0x01 + vpclmulqdq zmm11,zmm13,zmm12,0x10 + vpternlogq zmm26,zmm4,zmm10,0x96 + vpternlogq zmm24,zmm0,zmm6,0x96 + vpternlogq zmm25,zmm3,zmm7,0x96 
+ vpternlogq zmm26,zmm5,zmm11,0x96 + vmovdqa64 zmm13,ZMMWORD[1408+rsp] + vmovdqu64 zmm12,ZMMWORD[640+rsp] + vpclmulqdq zmm0,zmm13,zmm12,0x11 + vpclmulqdq zmm3,zmm13,zmm12,0x00 + vpclmulqdq zmm4,zmm13,zmm12,0x01 + vpclmulqdq zmm5,zmm13,zmm12,0x10 + vmovdqa64 zmm13,ZMMWORD[1472+rsp] + vmovdqu64 zmm12,ZMMWORD[704+rsp] + vpclmulqdq zmm6,zmm13,zmm12,0x11 + vpclmulqdq zmm7,zmm13,zmm12,0x00 + vpclmulqdq zmm10,zmm13,zmm12,0x01 + vpclmulqdq zmm11,zmm13,zmm12,0x10 + + vpternlogq zmm26,zmm4,zmm10,0x96 + vpternlogq zmm24,zmm0,zmm6,0x96 + vpternlogq zmm25,zmm3,zmm7,0x96 + vpternlogq zmm26,zmm5,zmm11,0x96 + + vpsrldq zmm0,zmm26,8 + vpslldq zmm3,zmm26,8 + vpxorq zmm24,zmm24,zmm0 + vpxorq zmm25,zmm25,zmm3 + vextracti64x4 ymm0,zmm24,1 + vpxorq ymm24,ymm24,ymm0 + vextracti32x4 xmm0,ymm24,1 + vpxorq xmm24,xmm24,xmm0 + vextracti64x4 ymm3,zmm25,1 + vpxorq ymm25,ymm25,ymm3 + vextracti32x4 xmm3,ymm25,1 + vpxorq xmm25,xmm25,xmm3 + vmovdqa64 xmm4,XMMWORD[POLY2] + + + vpclmulqdq xmm0,xmm4,xmm25,0x01 + vpslldq xmm0,xmm0,8 + vpxorq xmm0,xmm25,xmm0 + + + vpclmulqdq xmm3,xmm4,xmm0,0x00 + vpsrldq xmm3,xmm3,4 + vpclmulqdq xmm14,xmm4,xmm0,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm3,xmm24,0x96 + +$L$_last_blocks_done_412: + vpshufb xmm2,xmm2,xmm29 + jmp NEAR $L$_ghash_done_334 + +$L$_message_below_32_blocks_334: + + + sub r13,256 + add r11,256 + mov r10d,r13d + test r14,r14 + jnz NEAR $L$_skip_hkeys_precomputation_445 + vmovdqu64 zmm3,ZMMWORD[640+rsp] + + + vshufi64x2 zmm3,zmm3,zmm3,0x00 + + vmovdqu64 zmm4,ZMMWORD[576+rsp] + vmovdqu64 zmm5,ZMMWORD[512+rsp] + + vpclmulqdq zmm6,zmm4,zmm3,0x11 + vpclmulqdq zmm7,zmm4,zmm3,0x00 + vpclmulqdq zmm10,zmm4,zmm3,0x01 + vpclmulqdq zmm4,zmm4,zmm3,0x10 + vpxorq zmm4,zmm4,zmm10 + + vpsrldq zmm10,zmm4,8 + vpslldq zmm4,zmm4,8 + vpxorq zmm6,zmm6,zmm10 + vpxorq zmm4,zmm4,zmm7 + + + + vmovdqu64 zmm10,ZMMWORD[POLY2] + + vpclmulqdq zmm7,zmm10,zmm4,0x01 + vpslldq zmm7,zmm7,8 + vpxorq zmm4,zmm4,zmm7 + + + + vpclmulqdq zmm7,zmm10,zmm4,0x00 + vpsrldq zmm7,zmm7,4 + vpclmulqdq zmm4,zmm10,zmm4,0x10 + vpslldq zmm4,zmm4,4 + + vpternlogq zmm4,zmm6,zmm7,0x96 + + vmovdqu64 ZMMWORD[448+rsp],zmm4 + + vpclmulqdq zmm6,zmm5,zmm3,0x11 + vpclmulqdq zmm7,zmm5,zmm3,0x00 + vpclmulqdq zmm10,zmm5,zmm3,0x01 + vpclmulqdq zmm5,zmm5,zmm3,0x10 + vpxorq zmm5,zmm5,zmm10 + + vpsrldq zmm10,zmm5,8 + vpslldq zmm5,zmm5,8 + vpxorq zmm6,zmm6,zmm10 + vpxorq zmm5,zmm5,zmm7 + + + + vmovdqu64 zmm10,ZMMWORD[POLY2] + + vpclmulqdq zmm7,zmm10,zmm5,0x01 + vpslldq zmm7,zmm7,8 + vpxorq zmm5,zmm5,zmm7 + + + + vpclmulqdq zmm7,zmm10,zmm5,0x00 + vpsrldq zmm7,zmm7,4 + vpclmulqdq zmm5,zmm10,zmm5,0x10 + vpslldq zmm5,zmm5,4 + + vpternlogq zmm5,zmm6,zmm7,0x96 + + vmovdqu64 ZMMWORD[384+rsp],zmm5 + + vpclmulqdq zmm6,zmm4,zmm3,0x11 + vpclmulqdq zmm7,zmm4,zmm3,0x00 + vpclmulqdq zmm10,zmm4,zmm3,0x01 + vpclmulqdq zmm4,zmm4,zmm3,0x10 + vpxorq zmm4,zmm4,zmm10 + + vpsrldq zmm10,zmm4,8 + vpslldq zmm4,zmm4,8 + vpxorq zmm6,zmm6,zmm10 + vpxorq zmm4,zmm4,zmm7 + + + + vmovdqu64 zmm10,ZMMWORD[POLY2] + + vpclmulqdq zmm7,zmm10,zmm4,0x01 + vpslldq zmm7,zmm7,8 + vpxorq zmm4,zmm4,zmm7 + + + + vpclmulqdq zmm7,zmm10,zmm4,0x00 + vpsrldq zmm7,zmm7,4 + vpclmulqdq zmm4,zmm10,zmm4,0x10 + vpslldq zmm4,zmm4,4 + + vpternlogq zmm4,zmm6,zmm7,0x96 + + vmovdqu64 ZMMWORD[320+rsp],zmm4 + + vpclmulqdq zmm6,zmm5,zmm3,0x11 + vpclmulqdq zmm7,zmm5,zmm3,0x00 + vpclmulqdq zmm10,zmm5,zmm3,0x01 + vpclmulqdq zmm5,zmm5,zmm3,0x10 + vpxorq zmm5,zmm5,zmm10 + + vpsrldq zmm10,zmm5,8 + vpslldq zmm5,zmm5,8 + vpxorq zmm6,zmm6,zmm10 + vpxorq zmm5,zmm5,zmm7 + + + + vmovdqu64 zmm10,ZMMWORD[POLY2] + + 
vpclmulqdq zmm7,zmm10,zmm5,0x01 + vpslldq zmm7,zmm7,8 + vpxorq zmm5,zmm5,zmm7 + + + + vpclmulqdq zmm7,zmm10,zmm5,0x00 + vpsrldq zmm7,zmm7,4 + vpclmulqdq zmm5,zmm10,zmm5,0x10 + vpslldq zmm5,zmm5,4 + + vpternlogq zmm5,zmm6,zmm7,0x96 + + vmovdqu64 ZMMWORD[256+rsp],zmm5 +$L$_skip_hkeys_precomputation_445: + mov r14,1 + and r10d,~15 + mov ebx,512 + sub ebx,r10d + mov r10d,r13d + add r10d,15 + shr r10d,4 + je NEAR $L$_last_num_blocks_is_0_446 + + cmp r10d,8 + je NEAR $L$_last_num_blocks_is_8_446 + jb NEAR $L$_last_num_blocks_is_7_1_446 + + + cmp r10d,12 + je NEAR $L$_last_num_blocks_is_12_446 + jb NEAR $L$_last_num_blocks_is_11_9_446 + + + cmp r10d,15 + je NEAR $L$_last_num_blocks_is_15_446 + ja NEAR $L$_last_num_blocks_is_16_446 + cmp r10d,14 + je NEAR $L$_last_num_blocks_is_14_446 + jmp NEAR $L$_last_num_blocks_is_13_446 + +$L$_last_num_blocks_is_11_9_446: + + cmp r10d,10 + je NEAR $L$_last_num_blocks_is_10_446 + ja NEAR $L$_last_num_blocks_is_11_446 + jmp NEAR $L$_last_num_blocks_is_9_446 + +$L$_last_num_blocks_is_7_1_446: + cmp r10d,4 + je NEAR $L$_last_num_blocks_is_4_446 + jb NEAR $L$_last_num_blocks_is_3_1_446 + + cmp r10d,6 + ja NEAR $L$_last_num_blocks_is_7_446 + je NEAR $L$_last_num_blocks_is_6_446 + jmp NEAR $L$_last_num_blocks_is_5_446 + +$L$_last_num_blocks_is_3_1_446: + + cmp r10d,2 + ja NEAR $L$_last_num_blocks_is_3_446 + je NEAR $L$_last_num_blocks_is_2_446 +$L$_last_num_blocks_is_1_446: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + kmovq k1,[rax*8+r10] + cmp r15d,255 + jae NEAR $L$_16_blocks_overflow_447 + vpaddd xmm0,xmm2,xmm28 + jmp NEAR $L$_16_blocks_ok_447 + +$L$_16_blocks_overflow_447: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpshufb xmm0,xmm0,xmm29 +$L$_16_blocks_ok_447: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm0,0 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc xmm0,xmm0,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc xmm0,xmm0,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 xmm17{k1}{z},[r11*1+r9] + vaesenc xmm0,xmm0,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc 
xmm0,xmm0,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc xmm0,xmm0,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc xmm0,xmm0,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + vaesenc xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + vaesenc xmm0,xmm0,xmm31 + vaesenclast xmm0,xmm0,xmm30 + vpxorq xmm0,xmm0,xmm17 + vextracti32x4 xmm11,zmm0,0 + mov r10,QWORD[120+rbp] + vmovdqu8 XMMWORD[r11*1+r10]{k1},xmm0 + vmovdqu8 zmm0{k1}{z},zmm0 + vpshufb xmm17,xmm0,xmm29 + vextracti32x4 xmm7,zmm17,0 + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_448 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm17,xmm1,0x01 + vpclmulqdq xmm5,xmm17,xmm1,0x10 + vpclmulqdq xmm0,xmm17,xmm1,0x11 + vpclmulqdq xmm3,xmm17,xmm1,0x00 + vpxorq zmm4,zmm4,zmm26 + vpxorq zmm0,zmm0,zmm24 + vpxorq zmm3,zmm3,zmm25 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_448 +$L$_small_initial_partial_block_448: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + + + vpsrldq zmm0,zmm26,8 + vpslldq zmm3,zmm26,8 + vpxorq zmm24,zmm24,zmm0 + vpxorq zmm25,zmm25,zmm3 + vextracti64x4 ymm0,zmm24,1 + vpxorq ymm24,ymm24,ymm0 + vextracti32x4 xmm0,ymm24,1 + vpxorq xmm24,xmm24,xmm0 + vextracti64x4 ymm3,zmm25,1 + vpxorq ymm25,ymm25,ymm3 + vextracti32x4 xmm3,ymm25,1 + vpxorq xmm25,xmm25,xmm3 + vmovdqa64 xmm0,XMMWORD[POLY2] + + + vpclmulqdq xmm3,xmm0,xmm25,0x01 + vpslldq xmm3,xmm3,8 + vpxorq xmm3,xmm25,xmm3 + + + vpclmulqdq xmm4,xmm0,xmm3,0x00 + vpsrldq xmm4,xmm4,4 + vpclmulqdq xmm14,xmm0,xmm3,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm4,xmm24,0x96 + + + + + + + + + + + + + vpxorq xmm14,xmm14,xmm7 + + jmp NEAR $L$_after_reduction_448 +$L$_small_initial_compute_done_448: +$L$_after_reduction_448: + jmp NEAR $L$_last_blocks_done_446 +$L$_last_num_blocks_is_2_446: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + kmovq k1,[rax*8+r10] + cmp r15d,254 + jae NEAR $L$_16_blocks_overflow_449 + vpaddd ymm0,ymm2,ymm28 + jmp NEAR $L$_16_blocks_ok_449 + +$L$_16_blocks_overflow_449: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpshufb ymm0,ymm0,ymm29 +$L$_16_blocks_ok_449: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm0,1 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc ymm0,ymm0,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq 
zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc ymm0,ymm0,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 ymm17{k1}{z},[r11*1+r9] + vaesenc ymm0,ymm0,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc ymm0,ymm0,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc ymm0,ymm0,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc ymm0,ymm0,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + vaesenc ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + vaesenc ymm0,ymm0,ymm31 + vaesenclast ymm0,ymm0,ymm30 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm11,zmm0,1 + mov r10,QWORD[120+rbp] + vmovdqu8 YMMWORD[r11*1+r10]{k1},ymm0 + vmovdqu8 zmm0{k1}{z},zmm0 + vpshufb ymm17,ymm0,ymm29 + vextracti32x4 xmm7,zmm17,1 + sub r13,16 * (2 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_450 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm17,ymm1,0x01 + vpclmulqdq ymm5,ymm17,ymm1,0x10 + vpclmulqdq ymm0,ymm17,ymm1,0x11 + vpclmulqdq ymm3,ymm17,ymm1,0x00 + vpxorq zmm4,zmm4,zmm26 + vpxorq zmm0,zmm0,zmm24 + vpxorq zmm3,zmm3,zmm25 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_450 +$L$_small_initial_partial_block_450: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm17,xmm1,0x01 + vpclmulqdq xmm5,xmm17,xmm1,0x10 + vpclmulqdq xmm0,xmm17,xmm1,0x11 + vpclmulqdq xmm3,xmm17,xmm1,0x00 + vpxorq zmm4,zmm4,zmm26 + vpxorq zmm0,zmm0,zmm24 + vpxorq zmm3,zmm3,zmm25 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq 
xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_450: + + or r13,r13 + je NEAR $L$_after_reduction_450 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_450: + jmp NEAR $L$_last_blocks_done_446 +$L$_last_num_blocks_is_3_446: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + kmovq k1,[rax*8+r10] + cmp r15d,253 + jae NEAR $L$_16_blocks_overflow_451 + vpaddd zmm0,zmm2,zmm28 + jmp NEAR $L$_16_blocks_ok_451 + +$L$_16_blocks_overflow_451: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpshufb zmm0,zmm0,zmm29 +$L$_16_blocks_ok_451: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm0,2 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17{k1}{z},[r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vpxorq zmm0,zmm0,zmm17 + vextracti32x4 xmm11,zmm0,2 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10]{k1},zmm0 + vmovdqu8 zmm0{k1}{z},zmm0 + vpshufb zmm17,zmm0,zmm29 + vextracti32x4 xmm7,zmm17,2 + sub r13,16 * (3 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_452 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpxorq 
zmm4,zmm4,zmm26 + vpxorq zmm0,zmm0,zmm24 + vpxorq zmm3,zmm3,zmm25 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_452 +$L$_small_initial_partial_block_452: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm17,ymm1,0x01 + vpclmulqdq ymm5,ymm17,ymm1,0x10 + vpclmulqdq ymm0,ymm17,ymm1,0x11 + vpclmulqdq ymm3,ymm17,ymm1,0x00 + vpxorq zmm4,zmm4,zmm26 + vpxorq zmm0,zmm0,zmm24 + vpxorq zmm3,zmm3,zmm25 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_452: + + or r13,r13 + je NEAR $L$_after_reduction_452 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_452: + jmp NEAR $L$_last_blocks_done_446 +$L$_last_num_blocks_is_4_446: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + kmovq k1,[rax*8+r10] + cmp r15d,252 + jae NEAR $L$_16_blocks_overflow_453 + vpaddd zmm0,zmm2,zmm28 + jmp NEAR $L$_16_blocks_ok_453 + +$L$_16_blocks_overflow_453: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpshufb zmm0,zmm0,zmm29 +$L$_16_blocks_ok_453: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm0,3 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 
zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17{k1}{z},[r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vpxorq zmm0,zmm0,zmm17 + vextracti32x4 xmm11,zmm0,3 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10]{k1},zmm0 + vmovdqu8 zmm0{k1}{z},zmm0 + vpshufb zmm17,zmm0,zmm29 + vextracti32x4 xmm7,zmm17,3 + sub r13,16 * (4 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_454 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + + vpxorq zmm30,zmm30,zmm26 + vpxorq zmm8,zmm8,zmm24 + vpxorq zmm22,zmm22,zmm25 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_454 +$L$_small_initial_partial_block_454: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpxorq zmm4,zmm4,zmm26 + vpxorq zmm0,zmm0,zmm24 + vpxorq zmm3,zmm3,zmm25 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_454: + + or r13,r13 + je NEAR $L$_after_reduction_454 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_454: + jmp NEAR $L$_last_blocks_done_446 +$L$_last_num_blocks_is_5_446: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,64 + kmovq k1,[rax*8+r10] + cmp r15d,251 + jae NEAR $L$_16_blocks_overflow_455 + vpaddd zmm0,zmm2,zmm28 + vpaddd xmm3,xmm0,xmm27 + jmp 
NEAR $L$_16_blocks_ok_455 + +$L$_16_blocks_overflow_455: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb xmm3,xmm3,xmm29 +$L$_16_blocks_ok_455: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm3,0 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 xmm19{k1}{z},[64+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast xmm3,xmm3,xmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq xmm3,xmm3,xmm19 + vextracti32x4 xmm11,zmm3,0 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 XMMWORD[64+r11*1+r10]{k1},xmm3 + vmovdqu8 zmm3{k1}{z},zmm3 + vpshufb zmm17,zmm0,zmm29 + vpshufb xmm19,xmm3,xmm29 + vextracti32x4 xmm7,zmm19,0 + sub r13,16 * (5 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_456 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 
xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm19,xmm1,0x01 + vpclmulqdq xmm5,xmm19,xmm1,0x10 + vpclmulqdq xmm0,xmm19,xmm1,0x11 + vpclmulqdq xmm3,xmm19,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_456 +$L$_small_initial_partial_block_456: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + + vpxorq zmm30,zmm30,zmm26 + vpxorq zmm8,zmm8,zmm24 + vpxorq zmm22,zmm22,zmm25 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_456: + + or r13,r13 + je NEAR $L$_after_reduction_456 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_456: + jmp NEAR $L$_last_blocks_done_446 +$L$_last_num_blocks_is_6_446: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,64 + kmovq k1,[rax*8+r10] + cmp r15d,250 + jae NEAR $L$_16_blocks_overflow_457 + vpaddd zmm0,zmm2,zmm28 + vpaddd ymm3,ymm0,ymm27 + jmp NEAR $L$_16_blocks_ok_457 + +$L$_16_blocks_overflow_457: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb ymm3,ymm3,ymm29 +$L$_16_blocks_ok_457: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm3,1 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc ymm3,ymm3,ymm30 + 
vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 ymm19{k1}{z},[64+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast ymm3,ymm3,ymm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm11,zmm3,1 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 YMMWORD[64+r11*1+r10]{k1},ymm3 + vmovdqu8 zmm3{k1}{z},zmm3 + vpshufb zmm17,zmm0,zmm29 + vpshufb ymm19,ymm3,ymm29 + vextracti32x4 xmm7,zmm19,1 + sub r13,16 * (6 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_458 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm19,ymm1,0x01 + vpclmulqdq ymm5,ymm19,ymm1,0x10 + vpclmulqdq ymm0,ymm19,ymm1,0x11 + vpclmulqdq ymm3,ymm19,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_458 +$L$_small_initial_partial_block_458: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 
xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm19,xmm1,0x01 + vpclmulqdq xmm5,xmm19,xmm1,0x10 + vpclmulqdq xmm0,xmm19,xmm1,0x11 + vpclmulqdq xmm3,xmm19,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_458: + + or r13,r13 + je NEAR $L$_after_reduction_458 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_458: + jmp NEAR $L$_last_blocks_done_446 +$L$_last_num_blocks_is_7_446: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,64 + kmovq k1,[rax*8+r10] + cmp r15d,249 + jae NEAR $L$_16_blocks_overflow_459 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + jmp NEAR $L$_16_blocks_ok_459 + +$L$_16_blocks_overflow_459: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 +$L$_16_blocks_ok_459: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm3,2 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19{k1}{z},[64+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq 
zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti32x4 xmm11,zmm3,2 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10]{k1},zmm3 + vmovdqu8 zmm3{k1}{z},zmm3 + vpshufb zmm17,zmm0,zmm29 + vpshufb zmm19,zmm3,zmm29 + vextracti32x4 xmm7,zmm19,2 + sub r13,16 * (7 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_460 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm19,zmm1,0x01 + vpclmulqdq zmm5,zmm19,zmm1,0x10 + vpclmulqdq zmm0,zmm19,zmm1,0x11 + vpclmulqdq zmm3,zmm19,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_460 +$L$_small_initial_partial_block_460: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm19,ymm1,0x01 + vpclmulqdq ymm5,ymm19,ymm1,0x10 + vpclmulqdq ymm0,ymm19,ymm1,0x11 + vpclmulqdq ymm3,ymm19,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_460: + + or r13,r13 + je NEAR $L$_after_reduction_460 + 
vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_460: + jmp NEAR $L$_last_blocks_done_446 +$L$_last_num_blocks_is_8_446: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,64 + kmovq k1,[rax*8+r10] + cmp r15d,248 + jae NEAR $L$_16_blocks_overflow_461 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + jmp NEAR $L$_16_blocks_ok_461 + +$L$_16_blocks_overflow_461: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 +$L$_16_blocks_ok_461: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm3,3 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19{k1}{z},[64+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti32x4 xmm11,zmm3,3 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10]{k1},zmm3 + vmovdqu8 zmm3{k1}{z},zmm3 + vpshufb zmm17,zmm0,zmm29 + vpshufb zmm19,zmm3,zmm29 + vextracti32x4 xmm7,zmm19,3 + sub r13,16 * (8 
- 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_462 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[224+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + + vpxorq zmm30,zmm30,zmm26 + vpxorq zmm8,zmm8,zmm24 + vpxorq zmm22,zmm22,zmm25 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_462 +$L$_small_initial_partial_block_462: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm19,zmm1,0x01 + vpclmulqdq zmm5,zmm19,zmm1,0x10 + vpclmulqdq zmm0,zmm19,zmm1,0x11 + vpclmulqdq zmm3,zmm19,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_462: + + or r13,r13 + je NEAR $L$_after_reduction_462 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_462: + jmp NEAR $L$_last_blocks_done_446 +$L$_last_num_blocks_is_9_446: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,128 + kmovq k1,[rax*8+r10] + cmp r15d,247 + jae NEAR $L$_16_blocks_overflow_463 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd xmm4,xmm3,xmm27 + jmp NEAR $L$_16_blocks_ok_463 + +$L$_16_blocks_overflow_463: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb xmm4,xmm4,xmm29 +$L$_16_blocks_ok_463: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm4,0 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] 
+ vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 xmm20{k1}{z},[128+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast xmm4,xmm4,xmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq xmm4,xmm4,xmm20 + vextracti32x4 xmm11,zmm4,0 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 XMMWORD[128+r11*1+r10]{k1},xmm4 + vmovdqu8 zmm4{k1}{z},zmm4 + vpshufb zmm17,zmm0,zmm29 + vpshufb zmm19,zmm3,zmm29 + vpshufb xmm20,xmm4,xmm29 + vextracti32x4 xmm7,zmm20,0 + sub r13,16 * (9 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_464 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[208+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 
+ vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm20,xmm1,0x01 + vpclmulqdq xmm5,xmm20,xmm1,0x10 + vpclmulqdq xmm0,xmm20,xmm1,0x11 + vpclmulqdq xmm3,xmm20,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_464 +$L$_small_initial_partial_block_464: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[224+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + + vpxorq zmm30,zmm30,zmm26 + vpxorq zmm8,zmm8,zmm24 + vpxorq zmm22,zmm22,zmm25 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_464: + + or r13,r13 + je NEAR $L$_after_reduction_464 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_464: + jmp NEAR $L$_last_blocks_done_446 +$L$_last_num_blocks_is_10_446: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,128 + kmovq k1,[rax*8+r10] + cmp r15d,246 + jae NEAR $L$_16_blocks_overflow_465 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd ymm4,ymm3,ymm27 + jmp NEAR $L$_16_blocks_ok_465 + +$L$_16_blocks_overflow_465: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb ymm4,ymm4,ymm29 +$L$_16_blocks_ok_465: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm4,1 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + 
+ + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 ymm20{k1}{z},[128+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast ymm4,ymm4,ymm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq ymm4,ymm4,ymm20 + vextracti32x4 xmm11,zmm4,1 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 YMMWORD[128+r11*1+r10]{k1},ymm4 + vmovdqu8 zmm4{k1}{z},zmm4 + vpshufb zmm17,zmm0,zmm29 + vpshufb zmm19,zmm3,zmm29 + vpshufb ymm20,ymm4,ymm29 + vextracti32x4 xmm7,zmm20,1 + sub r13,16 * (10 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_466 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[192+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq 
zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm20,ymm1,0x01 + vpclmulqdq ymm5,ymm20,ymm1,0x10 + vpclmulqdq ymm0,ymm20,ymm1,0x11 + vpclmulqdq ymm3,ymm20,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_466 +$L$_small_initial_partial_block_466: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[208+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm20,xmm1,0x01 + vpclmulqdq xmm5,xmm20,xmm1,0x10 + vpclmulqdq xmm0,xmm20,xmm1,0x11 + vpclmulqdq xmm3,xmm20,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_466: + + or r13,r13 + je NEAR $L$_after_reduction_466 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_466: + jmp NEAR $L$_last_blocks_done_446 +$L$_last_num_blocks_is_11_446: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,128 + kmovq k1,[rax*8+r10] + cmp r15d,245 + jae NEAR $L$_16_blocks_overflow_467 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + jmp NEAR $L$_16_blocks_ok_467 + +$L$_16_blocks_overflow_467: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 +$L$_16_blocks_ok_467: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm4,2 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq 
zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20{k1}{z},[128+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vextracti32x4 xmm11,zmm4,2 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10]{k1},zmm4 + vmovdqu8 zmm4{k1}{z},zmm4 + vpshufb zmm17,zmm0,zmm29 + vpshufb zmm19,zmm3,zmm29 + vpshufb zmm20,zmm4,zmm29 + vextracti32x4 xmm7,zmm20,2 + sub r13,16 * (11 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_468 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[176+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + 
vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm20,zmm1,0x01 + vpclmulqdq zmm5,zmm20,zmm1,0x10 + vpclmulqdq zmm0,zmm20,zmm1,0x11 + vpclmulqdq zmm3,zmm20,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_468 +$L$_small_initial_partial_block_468: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[192+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm20,ymm1,0x01 + vpclmulqdq ymm5,ymm20,ymm1,0x10 + vpclmulqdq ymm0,ymm20,ymm1,0x11 + vpclmulqdq ymm3,ymm20,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_468: + + or r13,r13 + je NEAR $L$_after_reduction_468 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_468: + jmp NEAR $L$_last_blocks_done_446 +$L$_last_num_blocks_is_12_446: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,128 + kmovq k1,[rax*8+r10] + cmp r15d,244 + jae NEAR $L$_16_blocks_overflow_469 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + jmp NEAR $L$_16_blocks_ok_469 + +$L$_16_blocks_overflow_469: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 +$L$_16_blocks_ok_469: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm4,3 + 
vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20{k1}{z},[128+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vextracti32x4 xmm11,zmm4,3 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10]{k1},zmm4 + vmovdqu8 zmm4{k1}{z},zmm4 + vpshufb zmm17,zmm0,zmm29 + vpshufb zmm19,zmm3,zmm29 + vpshufb zmm20,zmm4,zmm29 + vextracti32x4 xmm7,zmm20,3 + sub r13,16 * (12 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_470 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[160+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq 
zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[224+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + + vpxorq zmm30,zmm30,zmm26 + vpxorq zmm8,zmm8,zmm24 + vpxorq zmm22,zmm22,zmm25 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_470 +$L$_small_initial_partial_block_470: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[176+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm20,zmm1,0x01 + vpclmulqdq zmm5,zmm20,zmm1,0x10 + vpclmulqdq zmm0,zmm20,zmm1,0x11 + vpclmulqdq zmm3,zmm20,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_470: + + or r13,r13 + je NEAR $L$_after_reduction_470 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_470: + jmp NEAR $L$_last_blocks_done_446 +$L$_last_num_blocks_is_13_446: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,192 + kmovq k1,[rax*8+r10] + cmp r15d,243 + jae NEAR $L$_16_blocks_overflow_471 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + vpaddd xmm5,xmm4,xmm27 + jmp NEAR $L$_16_blocks_ok_471 + +$L$_16_blocks_overflow_471: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpaddd zmm5,zmm4,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 
+ vpshufb zmm4,zmm4,zmm29 + vpshufb xmm5,xmm5,xmm29 +$L$_16_blocks_ok_471: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm5,0 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vpxorq xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20,ZMMWORD[128+r11*1+r9] + vmovdqu8 xmm21{k1}{z},[192+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + 
vaesenclast zmm4,zmm4,zmm30 + vaesenclast xmm5,xmm5,xmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vpxorq xmm5,xmm5,xmm21 + vextracti32x4 xmm11,zmm5,0 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm4 + vmovdqu8 XMMWORD[192+r11*1+r10]{k1},xmm5 + vmovdqu8 zmm5{k1}{z},zmm5 + vpshufb zmm17,zmm0,zmm29 + vpshufb zmm19,zmm3,zmm29 + vpshufb zmm20,zmm4,zmm29 + vpshufb xmm21,xmm5,xmm29 + vextracti32x4 xmm7,zmm21,0 + sub r13,16 * (13 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_472 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[144+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[208+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm21,xmm1,0x01 + vpclmulqdq xmm5,xmm21,xmm1,0x10 + vpclmulqdq xmm0,xmm21,xmm1,0x11 + vpclmulqdq xmm3,xmm21,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_472 +$L$_small_initial_partial_block_472: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[160+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[224+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + + vpxorq zmm30,zmm30,zmm26 + vpxorq zmm8,zmm8,zmm24 + vpxorq zmm22,zmm22,zmm25 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq 
xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_472: + + or r13,r13 + je NEAR $L$_after_reduction_472 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_472: + jmp NEAR $L$_last_blocks_done_446 +$L$_last_num_blocks_is_14_446: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,192 + kmovq k1,[rax*8+r10] + cmp r15d,242 + jae NEAR $L$_16_blocks_overflow_473 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + vpaddd ymm5,ymm4,ymm27 + jmp NEAR $L$_16_blocks_ok_473 + +$L$_16_blocks_overflow_473: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpaddd zmm5,zmm4,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb ymm5,ymm5,ymm29 +$L$_16_blocks_ok_473: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm5,1 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vpxorq ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20,ZMMWORD[128+r11*1+r9] + vmovdqu8 ymm21{k1}{z},[192+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vbroadcastf64x2 
zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vaesenclast ymm5,ymm5,ymm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vpxorq ymm5,ymm5,ymm21 + vextracti32x4 xmm11,zmm5,1 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm4 + vmovdqu8 YMMWORD[192+r11*1+r10]{k1},ymm5 + vmovdqu8 zmm5{k1}{z},zmm5 + vpshufb zmm17,zmm0,zmm29 + vpshufb zmm19,zmm3,zmm29 + vpshufb zmm20,zmm4,zmm29 + vpshufb ymm21,ymm5,ymm29 + vextracti32x4 xmm7,zmm21,1 + sub r13,16 * (14 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_474 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[128+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[192+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm21,ymm1,0x01 + vpclmulqdq ymm5,ymm21,ymm1,0x10 + vpclmulqdq ymm0,ymm21,ymm1,0x11 + vpclmulqdq ymm3,ymm21,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_474 +$L$_small_initial_partial_block_474: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[144+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[208+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq 
zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm21,xmm1,0x01 + vpclmulqdq xmm5,xmm21,xmm1,0x10 + vpclmulqdq xmm0,xmm21,xmm1,0x11 + vpclmulqdq xmm3,xmm21,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_474: + + or r13,r13 + je NEAR $L$_after_reduction_474 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_474: + jmp NEAR $L$_last_blocks_done_446 +$L$_last_num_blocks_is_15_446: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,192 + kmovq k1,[rax*8+r10] + cmp r15d,241 + jae NEAR $L$_16_blocks_overflow_475 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + vpaddd zmm5,zmm4,zmm27 + jmp NEAR $L$_16_blocks_ok_475 + +$L$_16_blocks_overflow_475: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpaddd zmm5,zmm4,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb zmm5,zmm5,zmm29 +$L$_16_blocks_ok_475: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm5,2 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 
zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20,ZMMWORD[128+r11*1+r9] + vmovdqu8 zmm21{k1}{z},[192+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vaesenclast zmm5,zmm5,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vpxorq zmm5,zmm5,zmm21 + vextracti32x4 xmm11,zmm5,2 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm4 + vmovdqu8 ZMMWORD[192+r11*1+r10]{k1},zmm5 + vmovdqu8 zmm5{k1}{z},zmm5 + vpshufb zmm17,zmm0,zmm29 + vpshufb zmm19,zmm3,zmm29 + vpshufb zmm20,zmm4,zmm29 + vpshufb zmm21,zmm5,zmm29 + vextracti32x4 xmm7,zmm21,2 + sub r13,16 * (15 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_476 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[112+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[176+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm21,zmm1,0x01 + vpclmulqdq zmm5,zmm21,zmm1,0x10 + vpclmulqdq zmm0,zmm21,zmm1,0x11 + 
vpclmulqdq zmm3,zmm21,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_476 +$L$_small_initial_partial_block_476: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[128+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[192+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm21,ymm1,0x01 + vpclmulqdq ymm5,ymm21,ymm1,0x10 + vpclmulqdq ymm0,ymm21,ymm1,0x11 + vpclmulqdq ymm3,ymm21,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_476: + + or r13,r13 + je NEAR $L$_after_reduction_476 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_476: + jmp NEAR $L$_last_blocks_done_446 +$L$_last_num_blocks_is_16_446: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,192 + kmovq k1,[rax*8+r10] + cmp r15d,240 + jae NEAR $L$_16_blocks_overflow_477 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + vpaddd zmm5,zmm4,zmm27 + jmp NEAR $L$_16_blocks_ok_477 + +$L$_16_blocks_overflow_477: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpaddd zmm5,zmm4,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb zmm5,zmm5,zmm29 +$L$_16_blocks_ok_477: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm5,3 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 
zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20,ZMMWORD[128+r11*1+r9] + vmovdqu8 zmm21{k1}{z},[192+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vaesenclast zmm5,zmm5,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vpxorq zmm5,zmm5,zmm21 + vextracti32x4 xmm11,zmm5,3 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 
ZMMWORD[128+r11*1+r10],zmm4 + vmovdqu8 ZMMWORD[192+r11*1+r10]{k1},zmm5 + vmovdqu8 zmm5{k1}{z},zmm5 + vpshufb zmm17,zmm0,zmm29 + vpshufb zmm19,zmm3,zmm29 + vpshufb zmm20,zmm4,zmm29 + vpshufb zmm21,zmm5,zmm29 + vextracti32x4 xmm7,zmm21,3 + sub r13,16 * (16 - 1) +$L$_small_initial_partial_block_478: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[112+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[176+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm21,zmm1,0x01 + vpclmulqdq zmm5,zmm21,zmm1,0x10 + vpclmulqdq zmm0,zmm21,zmm1,0x11 + vpclmulqdq zmm3,zmm21,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_478: + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_478: + jmp NEAR $L$_last_blocks_done_446 +$L$_last_num_blocks_is_0_446: + vmovdqa64 zmm13,ZMMWORD[768+rsp] + vpxorq zmm13,zmm13,zmm14 + vmovdqu64 zmm12,ZMMWORD[rbx*1+rsp] + vpclmulqdq zmm0,zmm13,zmm12,0x11 + vpclmulqdq zmm3,zmm13,zmm12,0x00 + vpclmulqdq zmm4,zmm13,zmm12,0x01 + vpclmulqdq zmm5,zmm13,zmm12,0x10 + vmovdqa64 zmm13,ZMMWORD[832+rsp] + vmovdqu64 zmm12,ZMMWORD[64+rbx*1+rsp] + vpclmulqdq zmm6,zmm13,zmm12,0x11 + vpclmulqdq zmm7,zmm13,zmm12,0x00 + vpclmulqdq zmm10,zmm13,zmm12,0x01 + vpclmulqdq zmm11,zmm13,zmm12,0x10 + vpxorq zmm26,zmm4,zmm10 + vpxorq zmm24,zmm0,zmm6 + vpxorq zmm25,zmm3,zmm7 + vpternlogq zmm26,zmm5,zmm11,0x96 + vmovdqa64 zmm13,ZMMWORD[896+rsp] + vmovdqu64 zmm12,ZMMWORD[128+rbx*1+rsp] + vpclmulqdq zmm0,zmm13,zmm12,0x11 + vpclmulqdq zmm3,zmm13,zmm12,0x00 + vpclmulqdq zmm4,zmm13,zmm12,0x01 + vpclmulqdq zmm5,zmm13,zmm12,0x10 + vmovdqa64 zmm13,ZMMWORD[960+rsp] + vmovdqu64 zmm12,ZMMWORD[192+rbx*1+rsp] + vpclmulqdq zmm6,zmm13,zmm12,0x11 + vpclmulqdq zmm7,zmm13,zmm12,0x00 + vpclmulqdq zmm10,zmm13,zmm12,0x01 + vpclmulqdq zmm11,zmm13,zmm12,0x10 + + vpternlogq zmm26,zmm4,zmm10,0x96 + vpternlogq zmm24,zmm0,zmm6,0x96 + vpternlogq zmm25,zmm3,zmm7,0x96 + vpternlogq zmm26,zmm5,zmm11,0x96 + + vpsrldq zmm0,zmm26,8 + vpslldq zmm3,zmm26,8 + vpxorq zmm24,zmm24,zmm0 + vpxorq zmm25,zmm25,zmm3 + vextracti64x4 ymm0,zmm24,1 + vpxorq ymm24,ymm24,ymm0 + vextracti32x4 xmm0,ymm24,1 + vpxorq xmm24,xmm24,xmm0 + vextracti64x4 ymm3,zmm25,1 + vpxorq ymm25,ymm25,ymm3 + vextracti32x4 
xmm3,ymm25,1 + vpxorq xmm25,xmm25,xmm3 + vmovdqa64 xmm4,XMMWORD[POLY2] + + + vpclmulqdq xmm0,xmm4,xmm25,0x01 + vpslldq xmm0,xmm0,8 + vpxorq xmm0,xmm25,xmm0 + + + vpclmulqdq xmm3,xmm4,xmm0,0x00 + vpsrldq xmm3,xmm3,4 + vpclmulqdq xmm14,xmm4,xmm0,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm3,xmm24,0x96 + +$L$_last_blocks_done_446: + vpshufb xmm2,xmm2,xmm29 + jmp NEAR $L$_ghash_done_334 + +$L$_message_below_equal_16_blocks_334: + + + mov r12d,r13d + add r12d,15 + shr r12d,4 + cmp r12,8 + je NEAR $L$_small_initial_num_blocks_is_8_479 + jl NEAR $L$_small_initial_num_blocks_is_7_1_479 + + + cmp r12,12 + je NEAR $L$_small_initial_num_blocks_is_12_479 + jl NEAR $L$_small_initial_num_blocks_is_11_9_479 + + + cmp r12,16 + je NEAR $L$_small_initial_num_blocks_is_16_479 + cmp r12,15 + je NEAR $L$_small_initial_num_blocks_is_15_479 + cmp r12,14 + je NEAR $L$_small_initial_num_blocks_is_14_479 + jmp NEAR $L$_small_initial_num_blocks_is_13_479 + +$L$_small_initial_num_blocks_is_11_9_479: + + cmp r12,11 + je NEAR $L$_small_initial_num_blocks_is_11_479 + cmp r12,10 + je NEAR $L$_small_initial_num_blocks_is_10_479 + jmp NEAR $L$_small_initial_num_blocks_is_9_479 + +$L$_small_initial_num_blocks_is_7_1_479: + cmp r12,4 + je NEAR $L$_small_initial_num_blocks_is_4_479 + jl NEAR $L$_small_initial_num_blocks_is_3_1_479 + + cmp r12,7 + je NEAR $L$_small_initial_num_blocks_is_7_479 + cmp r12,6 + je NEAR $L$_small_initial_num_blocks_is_6_479 + jmp NEAR $L$_small_initial_num_blocks_is_5_479 + +$L$_small_initial_num_blocks_is_3_1_479: + + cmp r12,3 + je NEAR $L$_small_initial_num_blocks_is_3_479 + cmp r12,2 + je NEAR $L$_small_initial_num_blocks_is_2_479 + + + + + +$L$_small_initial_num_blocks_is_1_479: + vmovdqa64 xmm29,XMMWORD[SHUF_MASK] + vpaddd xmm0,xmm2,XMMWORD[ONE] + lea r10,[byte64_len_to_mask_table] + mov r15,r13 + kmovq k1,[r15*8+r10] + vextracti32x4 xmm2,zmm0,0 + vpshufb xmm0,xmm0,xmm29 + vmovdqu8 xmm6{k1}{z},[r11*1+r9] + vbroadcastf64x2 zmm15,ZMMWORD[rcx] + vpxorq xmm0,xmm0,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[16+rcx] + vaesenc xmm0,xmm0,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[32+rcx] + vaesenc xmm0,xmm0,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[48+rcx] + vaesenc xmm0,xmm0,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[64+rcx] + vaesenc xmm0,xmm0,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[80+rcx] + vaesenc xmm0,xmm0,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[96+rcx] + vaesenc xmm0,xmm0,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[112+rcx] + vaesenc xmm0,xmm0,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[128+rcx] + vaesenc xmm0,xmm0,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[144+rcx] + vaesenc xmm0,xmm0,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[160+rcx] + vaesenc xmm0,xmm0,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[176+rcx] + vaesenc xmm0,xmm0,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[192+rcx] + vaesenc xmm0,xmm0,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[208+rcx] + vaesenc xmm0,xmm0,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[224+rcx] + vaesenclast xmm0,xmm0,xmm15 + vpxorq xmm0,xmm0,xmm6 + vextracti32x4 xmm12,zmm0,0 + mov r10,QWORD[120+rbp] + vmovdqu8 XMMWORD[r11*1+r10]{k1},xmm0 + vmovdqu8 zmm0{k1}{z},zmm0 + vpshufb xmm6,xmm0,xmm29 + vextracti32x4 xmm13,zmm6,0 + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_480 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 xmm20,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm6,xmm20,0x01 + vpclmulqdq xmm5,xmm6,xmm20,0x10 + vpclmulqdq xmm0,xmm6,xmm20,0x11 + vpclmulqdq xmm3,xmm6,xmm20,0x00 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq 
zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_480 +$L$_small_initial_partial_block_480: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm12 + + + + + + + + + + + + vpxorq xmm14,xmm14,xmm13 + + jmp NEAR $L$_after_reduction_480 +$L$_small_initial_compute_done_480: +$L$_after_reduction_480: + jmp NEAR $L$_small_initial_blocks_encrypted_479 +$L$_small_initial_num_blocks_is_2_479: + vmovdqa64 ymm29,YMMWORD[SHUF_MASK] + vshufi64x2 ymm0,ymm2,ymm2,0 + vpaddd ymm0,ymm0,YMMWORD[ddq_add_1234] + lea r10,[byte64_len_to_mask_table] + mov r15,r13 + kmovq k1,[r15*8+r10] + vextracti32x4 xmm2,zmm0,1 + vpshufb ymm0,ymm0,ymm29 + vmovdqu8 ymm6{k1}{z},[r11*1+r9] + vbroadcastf64x2 zmm15,ZMMWORD[rcx] + vpxorq ymm0,ymm0,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[16+rcx] + vaesenc ymm0,ymm0,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[32+rcx] + vaesenc ymm0,ymm0,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[48+rcx] + vaesenc ymm0,ymm0,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[64+rcx] + vaesenc ymm0,ymm0,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[80+rcx] + vaesenc ymm0,ymm0,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[96+rcx] + vaesenc ymm0,ymm0,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[112+rcx] + vaesenc ymm0,ymm0,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[128+rcx] + vaesenc ymm0,ymm0,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[144+rcx] + vaesenc ymm0,ymm0,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[160+rcx] + vaesenc ymm0,ymm0,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[176+rcx] + vaesenc ymm0,ymm0,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[192+rcx] + vaesenc ymm0,ymm0,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[208+rcx] + vaesenc ymm0,ymm0,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[224+rcx] + vaesenclast ymm0,ymm0,ymm15 + vpxorq ymm0,ymm0,ymm6 + vextracti32x4 xmm12,zmm0,1 + mov r10,QWORD[120+rbp] + vmovdqu8 YMMWORD[r11*1+r10]{k1},ymm0 + vmovdqu8 zmm0{k1}{z},zmm0 + vpshufb ymm6,ymm0,ymm29 + vextracti32x4 xmm13,zmm6,1 + sub r13,16 * (2 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_481 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 ymm20,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm6,ymm20,0x01 + vpclmulqdq ymm5,ymm6,ymm20,0x10 + vpclmulqdq ymm0,ymm6,ymm20,0x11 + vpclmulqdq ymm3,ymm6,ymm20,0x00 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_481 +$L$_small_initial_partial_block_481: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm12 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 xmm20,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm6,xmm20,0x01 + 
vpclmulqdq xmm5,xmm6,xmm20,0x10 + vpclmulqdq xmm0,xmm6,xmm20,0x11 + vpclmulqdq xmm3,xmm6,xmm20,0x00 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_481: + + or r13,r13 + je NEAR $L$_after_reduction_481 + vpxorq xmm14,xmm14,xmm13 +$L$_after_reduction_481: + jmp NEAR $L$_small_initial_blocks_encrypted_479 +$L$_small_initial_num_blocks_is_3_479: + vmovdqa64 zmm29,ZMMWORD[SHUF_MASK] + vshufi64x2 zmm2,zmm2,zmm2,0 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + lea r10,[byte64_len_to_mask_table] + mov r15,r13 + kmovq k1,[r15*8+r10] + vextracti32x4 xmm2,zmm0,2 + vpshufb zmm0,zmm0,zmm29 + vmovdqu8 zmm6{k1}{z},[r11*1+r9] + vbroadcastf64x2 zmm15,ZMMWORD[rcx] + vpxorq zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[16+rcx] + vaesenc zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[32+rcx] + vaesenc zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[48+rcx] + vaesenc zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[64+rcx] + vaesenc zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[80+rcx] + vaesenc zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[96+rcx] + vaesenc zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[112+rcx] + vaesenc zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[128+rcx] + vaesenc zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[224+rcx] + vaesenclast zmm0,zmm0,zmm15 + vpxorq zmm0,zmm0,zmm6 + vextracti32x4 xmm12,zmm0,2 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10]{k1},zmm0 + vmovdqu8 zmm0{k1}{z},zmm0 + vpshufb zmm6,zmm0,zmm29 + vextracti32x4 xmm13,zmm6,2 + sub r13,16 * (3 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_482 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 ymm20,YMMWORD[304+rdx] + vinserti64x2 zmm20,zmm20,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm6,zmm20,0x01 + vpclmulqdq zmm5,zmm6,zmm20,0x10 + vpclmulqdq zmm0,zmm6,zmm20,0x11 + vpclmulqdq zmm3,zmm6,zmm20,0x00 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_482 +$L$_small_initial_partial_block_482: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm12 + vpxorq 
zmm6,zmm6,zmm14 + vmovdqu64 ymm20,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm6,ymm20,0x01 + vpclmulqdq ymm5,ymm6,ymm20,0x10 + vpclmulqdq ymm0,ymm6,ymm20,0x11 + vpclmulqdq ymm3,ymm6,ymm20,0x00 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_482: + + or r13,r13 + je NEAR $L$_after_reduction_482 + vpxorq xmm14,xmm14,xmm13 +$L$_after_reduction_482: + jmp NEAR $L$_small_initial_blocks_encrypted_479 +$L$_small_initial_num_blocks_is_4_479: + vmovdqa64 zmm29,ZMMWORD[SHUF_MASK] + vshufi64x2 zmm2,zmm2,zmm2,0 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + lea r10,[byte64_len_to_mask_table] + mov r15,r13 + kmovq k1,[r15*8+r10] + vextracti32x4 xmm2,zmm0,3 + vpshufb zmm0,zmm0,zmm29 + vmovdqu8 zmm6{k1}{z},[r11*1+r9] + vbroadcastf64x2 zmm15,ZMMWORD[rcx] + vpxorq zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[16+rcx] + vaesenc zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[32+rcx] + vaesenc zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[48+rcx] + vaesenc zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[64+rcx] + vaesenc zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[80+rcx] + vaesenc zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[96+rcx] + vaesenc zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[112+rcx] + vaesenc zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[128+rcx] + vaesenc zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[224+rcx] + vaesenclast zmm0,zmm0,zmm15 + vpxorq zmm0,zmm0,zmm6 + vextracti32x4 xmm12,zmm0,3 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10]{k1},zmm0 + vmovdqu8 zmm0{k1}{z},zmm0 + vpshufb zmm6,zmm0,zmm29 + vextracti32x4 xmm13,zmm6,3 + sub r13,16 * (4 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_483 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[288+rdx] + vpclmulqdq zmm15,zmm6,zmm20,0x11 + vpclmulqdq zmm16,zmm6,zmm20,0x00 + vpclmulqdq zmm17,zmm6,zmm20,0x01 + vpclmulqdq zmm19,zmm6,zmm20,0x10 + + vpxorq zmm17,zmm17,zmm19 + vpsrldq zmm4,zmm17,8 + vpslldq zmm5,zmm17,8 + vpxorq zmm0,zmm15,zmm4 + vpxorq zmm3,zmm16,zmm5 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_483 +$L$_small_initial_partial_block_483: + + + + + + + + + mov QWORD[r8],r13 
+ vmovdqu64 XMMWORD[16+rdx],xmm12 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 ymm20,YMMWORD[304+rdx] + vinserti64x2 zmm20,zmm20,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm6,zmm20,0x01 + vpclmulqdq zmm5,zmm6,zmm20,0x10 + vpclmulqdq zmm0,zmm6,zmm20,0x11 + vpclmulqdq zmm3,zmm6,zmm20,0x00 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_483: + + or r13,r13 + je NEAR $L$_after_reduction_483 + vpxorq xmm14,xmm14,xmm13 +$L$_after_reduction_483: + jmp NEAR $L$_small_initial_blocks_encrypted_479 +$L$_small_initial_num_blocks_is_5_479: + vmovdqa64 zmm29,ZMMWORD[SHUF_MASK] + vshufi64x2 zmm2,zmm2,zmm2,0 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpaddd zmm3,zmm2,ZMMWORD[ddq_add_5678] + lea r10,[byte64_len_to_mask_table] + mov r15,r13 + sub r15,64 + kmovq k1,[r15*8+r10] + vextracti32x4 xmm2,zmm3,0 + vpshufb zmm0,zmm0,zmm29 + vpshufb xmm3,xmm3,xmm29 + vmovdqu8 zmm6,ZMMWORD[r11*1+r9] + vmovdqu8 xmm7{k1}{z},[64+r11*1+r9] + vbroadcastf64x2 zmm15,ZMMWORD[rcx] + vpxorq zmm0,zmm0,zmm15 + vpxorq xmm3,xmm3,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[16+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc xmm3,xmm3,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[32+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc xmm3,xmm3,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[48+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc xmm3,xmm3,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[64+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc xmm3,xmm3,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[80+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc xmm3,xmm3,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[96+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc xmm3,xmm3,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[112+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc xmm3,xmm3,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[128+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc xmm3,xmm3,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc xmm3,xmm3,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc xmm3,xmm3,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc xmm3,xmm3,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc xmm3,xmm3,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc xmm3,xmm3,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[224+rcx] + vaesenclast zmm0,zmm0,zmm15 + vaesenclast xmm3,xmm3,xmm15 + vpxorq zmm0,zmm0,zmm6 + vpxorq xmm3,xmm3,xmm7 + vextracti32x4 xmm12,zmm3,0 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 XMMWORD[64+r11*1+r10]{k1},xmm3 + vmovdqu8 zmm3{k1}{z},zmm3 + vpshufb zmm6,zmm0,zmm29 + vpshufb xmm7,xmm3,xmm29 + vextracti32x4 xmm13,zmm7,0 + sub r13,16 * (5 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_484 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[272+rdx] + vpclmulqdq zmm15,zmm6,zmm20,0x11 + vpclmulqdq zmm16,zmm6,zmm20,0x00 + vpclmulqdq zmm17,zmm6,zmm20,0x01 + vpclmulqdq zmm19,zmm6,zmm20,0x10 + vmovdqu64 xmm20,XMMWORD[336+rdx] + 
vpclmulqdq xmm4,xmm7,xmm20,0x01 + vpclmulqdq xmm5,xmm7,xmm20,0x10 + vpclmulqdq xmm0,xmm7,xmm20,0x11 + vpclmulqdq xmm3,xmm7,xmm20,0x00 + + vpxorq zmm4,zmm4,zmm17 + vpxorq zmm5,zmm5,zmm19 + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm16 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_484 +$L$_small_initial_partial_block_484: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm12 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[288+rdx] + vpclmulqdq zmm15,zmm6,zmm20,0x11 + vpclmulqdq zmm16,zmm6,zmm20,0x00 + vpclmulqdq zmm17,zmm6,zmm20,0x01 + vpclmulqdq zmm19,zmm6,zmm20,0x10 + + vpxorq zmm17,zmm17,zmm19 + vpsrldq zmm4,zmm17,8 + vpslldq zmm5,zmm17,8 + vpxorq zmm0,zmm15,zmm4 + vpxorq zmm3,zmm16,zmm5 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_484: + + or r13,r13 + je NEAR $L$_after_reduction_484 + vpxorq xmm14,xmm14,xmm13 +$L$_after_reduction_484: + jmp NEAR $L$_small_initial_blocks_encrypted_479 +$L$_small_initial_num_blocks_is_6_479: + vmovdqa64 zmm29,ZMMWORD[SHUF_MASK] + vshufi64x2 zmm2,zmm2,zmm2,0 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpaddd zmm3,zmm2,ZMMWORD[ddq_add_5678] + lea r10,[byte64_len_to_mask_table] + mov r15,r13 + sub r15,64 + kmovq k1,[r15*8+r10] + vextracti32x4 xmm2,zmm3,1 + vpshufb zmm0,zmm0,zmm29 + vpshufb ymm3,ymm3,ymm29 + vmovdqu8 zmm6,ZMMWORD[r11*1+r9] + vmovdqu8 ymm7{k1}{z},[64+r11*1+r9] + vbroadcastf64x2 zmm15,ZMMWORD[rcx] + vpxorq zmm0,zmm0,zmm15 + vpxorq ymm3,ymm3,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[16+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc ymm3,ymm3,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[32+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc ymm3,ymm3,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[48+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc ymm3,ymm3,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[64+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc ymm3,ymm3,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[80+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc ymm3,ymm3,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[96+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc ymm3,ymm3,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[112+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc ymm3,ymm3,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[128+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc ymm3,ymm3,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc ymm3,ymm3,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc ymm3,ymm3,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc ymm3,ymm3,ymm15 + vbroadcastf64x2 
zmm15,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc ymm3,ymm3,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc ymm3,ymm3,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[224+rcx] + vaesenclast zmm0,zmm0,zmm15 + vaesenclast ymm3,ymm3,ymm15 + vpxorq zmm0,zmm0,zmm6 + vpxorq ymm3,ymm3,ymm7 + vextracti32x4 xmm12,zmm3,1 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 YMMWORD[64+r11*1+r10]{k1},ymm3 + vmovdqu8 zmm3{k1}{z},zmm3 + vpshufb zmm6,zmm0,zmm29 + vpshufb ymm7,ymm3,ymm29 + vextracti32x4 xmm13,zmm7,1 + sub r13,16 * (6 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_485 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[256+rdx] + vpclmulqdq zmm15,zmm6,zmm20,0x11 + vpclmulqdq zmm16,zmm6,zmm20,0x00 + vpclmulqdq zmm17,zmm6,zmm20,0x01 + vpclmulqdq zmm19,zmm6,zmm20,0x10 + vmovdqu64 ymm20,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm7,ymm20,0x01 + vpclmulqdq ymm5,ymm7,ymm20,0x10 + vpclmulqdq ymm0,ymm7,ymm20,0x11 + vpclmulqdq ymm3,ymm7,ymm20,0x00 + + vpxorq zmm4,zmm4,zmm17 + vpxorq zmm5,zmm5,zmm19 + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm16 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_485 +$L$_small_initial_partial_block_485: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm12 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[272+rdx] + vpclmulqdq zmm15,zmm6,zmm20,0x11 + vpclmulqdq zmm16,zmm6,zmm20,0x00 + vpclmulqdq zmm17,zmm6,zmm20,0x01 + vpclmulqdq zmm19,zmm6,zmm20,0x10 + vmovdqu64 xmm20,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm7,xmm20,0x01 + vpclmulqdq xmm5,xmm7,xmm20,0x10 + vpclmulqdq xmm0,xmm7,xmm20,0x11 + vpclmulqdq xmm3,xmm7,xmm20,0x00 + + vpxorq zmm4,zmm4,zmm17 + vpxorq zmm5,zmm5,zmm19 + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm16 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_485: + + or r13,r13 + je NEAR $L$_after_reduction_485 + vpxorq xmm14,xmm14,xmm13 +$L$_after_reduction_485: + jmp NEAR $L$_small_initial_blocks_encrypted_479 +$L$_small_initial_num_blocks_is_7_479: + vmovdqa64 zmm29,ZMMWORD[SHUF_MASK] + vshufi64x2 zmm2,zmm2,zmm2,0 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpaddd zmm3,zmm2,ZMMWORD[ddq_add_5678] + lea r10,[byte64_len_to_mask_table] + mov r15,r13 + sub r15,64 + kmovq k1,[r15*8+r10] + vextracti32x4 xmm2,zmm3,2 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + 
vmovdqu8 zmm6,ZMMWORD[r11*1+r9] + vmovdqu8 zmm7{k1}{z},[64+r11*1+r9] + vbroadcastf64x2 zmm15,ZMMWORD[rcx] + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[16+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[32+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[48+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[64+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[80+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[96+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[112+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[128+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[224+rcx] + vaesenclast zmm0,zmm0,zmm15 + vaesenclast zmm3,zmm3,zmm15 + vpxorq zmm0,zmm0,zmm6 + vpxorq zmm3,zmm3,zmm7 + vextracti32x4 xmm12,zmm3,2 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10]{k1},zmm3 + vmovdqu8 zmm3{k1}{z},zmm3 + vpshufb zmm6,zmm0,zmm29 + vpshufb zmm7,zmm3,zmm29 + vextracti32x4 xmm13,zmm7,2 + sub r13,16 * (7 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_486 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[240+rdx] + vpclmulqdq zmm15,zmm6,zmm20,0x11 + vpclmulqdq zmm16,zmm6,zmm20,0x00 + vpclmulqdq zmm17,zmm6,zmm20,0x01 + vpclmulqdq zmm19,zmm6,zmm20,0x10 + vmovdqu64 ymm20,YMMWORD[304+rdx] + vinserti64x2 zmm20,zmm20,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm7,zmm20,0x01 + vpclmulqdq zmm5,zmm7,zmm20,0x10 + vpclmulqdq zmm0,zmm7,zmm20,0x11 + vpclmulqdq zmm3,zmm7,zmm20,0x00 + + vpxorq zmm4,zmm4,zmm17 + vpxorq zmm5,zmm5,zmm19 + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm16 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_486 +$L$_small_initial_partial_block_486: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm12 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[256+rdx] + vpclmulqdq zmm15,zmm6,zmm20,0x11 + vpclmulqdq zmm16,zmm6,zmm20,0x00 + vpclmulqdq zmm17,zmm6,zmm20,0x01 + vpclmulqdq zmm19,zmm6,zmm20,0x10 + vmovdqu64 ymm20,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm7,ymm20,0x01 + vpclmulqdq ymm5,ymm7,ymm20,0x10 + vpclmulqdq ymm0,ymm7,ymm20,0x11 + vpclmulqdq ymm3,ymm7,ymm20,0x00 + + vpxorq 
zmm4,zmm4,zmm17 + vpxorq zmm5,zmm5,zmm19 + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm16 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_486: + + or r13,r13 + je NEAR $L$_after_reduction_486 + vpxorq xmm14,xmm14,xmm13 +$L$_after_reduction_486: + jmp NEAR $L$_small_initial_blocks_encrypted_479 +$L$_small_initial_num_blocks_is_8_479: + vmovdqa64 zmm29,ZMMWORD[SHUF_MASK] + vshufi64x2 zmm2,zmm2,zmm2,0 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpaddd zmm3,zmm2,ZMMWORD[ddq_add_5678] + lea r10,[byte64_len_to_mask_table] + mov r15,r13 + sub r15,64 + kmovq k1,[r15*8+r10] + vextracti32x4 xmm2,zmm3,3 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vmovdqu8 zmm6,ZMMWORD[r11*1+r9] + vmovdqu8 zmm7{k1}{z},[64+r11*1+r9] + vbroadcastf64x2 zmm15,ZMMWORD[rcx] + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[16+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[32+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[48+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[64+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[80+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[96+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[112+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[128+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[224+rcx] + vaesenclast zmm0,zmm0,zmm15 + vaesenclast zmm3,zmm3,zmm15 + vpxorq zmm0,zmm0,zmm6 + vpxorq zmm3,zmm3,zmm7 + vextracti32x4 xmm12,zmm3,3 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10]{k1},zmm3 + vmovdqu8 zmm3{k1}{z},zmm3 + vpshufb zmm6,zmm0,zmm29 + vpshufb zmm7,zmm3,zmm29 + vextracti32x4 xmm13,zmm7,3 + sub r13,16 * (8 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_487 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[224+rdx] + vpclmulqdq zmm0,zmm6,zmm20,0x11 + vpclmulqdq zmm3,zmm6,zmm20,0x00 + vpclmulqdq zmm4,zmm6,zmm20,0x01 + vpclmulqdq zmm5,zmm6,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[288+rdx] + vpclmulqdq zmm15,zmm7,zmm20,0x11 + vpclmulqdq zmm16,zmm7,zmm20,0x00 + vpclmulqdq zmm17,zmm7,zmm20,0x01 + vpclmulqdq zmm19,zmm7,zmm20,0x10 + vpxorq zmm15,zmm0,zmm15 + vpxorq zmm16,zmm3,zmm16 + vpxorq 
zmm17,zmm4,zmm17 + vpxorq zmm19,zmm5,zmm19 + + vpxorq zmm17,zmm17,zmm19 + vpsrldq zmm4,zmm17,8 + vpslldq zmm5,zmm17,8 + vpxorq zmm0,zmm15,zmm4 + vpxorq zmm3,zmm16,zmm5 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_487 +$L$_small_initial_partial_block_487: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm12 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[240+rdx] + vpclmulqdq zmm15,zmm6,zmm20,0x11 + vpclmulqdq zmm16,zmm6,zmm20,0x00 + vpclmulqdq zmm17,zmm6,zmm20,0x01 + vpclmulqdq zmm19,zmm6,zmm20,0x10 + vmovdqu64 ymm20,YMMWORD[304+rdx] + vinserti64x2 zmm20,zmm20,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm7,zmm20,0x01 + vpclmulqdq zmm5,zmm7,zmm20,0x10 + vpclmulqdq zmm0,zmm7,zmm20,0x11 + vpclmulqdq zmm3,zmm7,zmm20,0x00 + + vpxorq zmm4,zmm4,zmm17 + vpxorq zmm5,zmm5,zmm19 + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm16 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_487: + + or r13,r13 + je NEAR $L$_after_reduction_487 + vpxorq xmm14,xmm14,xmm13 +$L$_after_reduction_487: + jmp NEAR $L$_small_initial_blocks_encrypted_479 +$L$_small_initial_num_blocks_is_9_479: + vmovdqa64 zmm29,ZMMWORD[SHUF_MASK] + vshufi64x2 zmm2,zmm2,zmm2,0 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpaddd zmm3,zmm2,ZMMWORD[ddq_add_5678] + vpaddd zmm4,zmm0,ZMMWORD[ddq_add_8888] + lea r10,[byte64_len_to_mask_table] + mov r15,r13 + sub r15,128 + kmovq k1,[r15*8+r10] + vextracti32x4 xmm2,zmm4,0 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb xmm4,xmm4,xmm29 + vmovdqu8 zmm6,ZMMWORD[r11*1+r9] + vmovdqu8 zmm7,ZMMWORD[64+r11*1+r9] + vmovdqu8 xmm10{k1}{z},[128+r11*1+r9] + vbroadcastf64x2 zmm15,ZMMWORD[rcx] + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm15 + vpxorq xmm4,xmm4,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[16+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc xmm4,xmm4,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[32+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc xmm4,xmm4,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[48+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc xmm4,xmm4,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[64+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc xmm4,xmm4,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[80+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc xmm4,xmm4,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[96+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc xmm4,xmm4,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[112+rcx] + vaesenc zmm0,zmm0,zmm15 + 
vaesenc zmm3,zmm3,zmm15 + vaesenc xmm4,xmm4,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[128+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc xmm4,xmm4,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc xmm4,xmm4,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc xmm4,xmm4,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc xmm4,xmm4,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc xmm4,xmm4,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc xmm4,xmm4,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[224+rcx] + vaesenclast zmm0,zmm0,zmm15 + vaesenclast zmm3,zmm3,zmm15 + vaesenclast xmm4,xmm4,xmm15 + vpxorq zmm0,zmm0,zmm6 + vpxorq zmm3,zmm3,zmm7 + vpxorq xmm4,xmm4,xmm10 + vextracti32x4 xmm12,zmm4,0 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 XMMWORD[128+r11*1+r10]{k1},xmm4 + vmovdqu8 zmm4{k1}{z},zmm4 + vpshufb zmm6,zmm0,zmm29 + vpshufb zmm7,zmm3,zmm29 + vpshufb xmm10,xmm4,xmm29 + vextracti32x4 xmm13,zmm10,0 + sub r13,16 * (9 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_488 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[208+rdx] + vpclmulqdq zmm0,zmm6,zmm20,0x11 + vpclmulqdq zmm3,zmm6,zmm20,0x00 + vpclmulqdq zmm4,zmm6,zmm20,0x01 + vpclmulqdq zmm5,zmm6,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[272+rdx] + vpclmulqdq zmm15,zmm7,zmm20,0x11 + vpclmulqdq zmm16,zmm7,zmm20,0x00 + vpclmulqdq zmm17,zmm7,zmm20,0x01 + vpclmulqdq zmm19,zmm7,zmm20,0x10 + vpxorq zmm15,zmm0,zmm15 + vpxorq zmm16,zmm3,zmm16 + vpxorq zmm17,zmm4,zmm17 + vpxorq zmm19,zmm5,zmm19 + vmovdqu64 xmm20,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm10,xmm20,0x01 + vpclmulqdq xmm5,xmm10,xmm20,0x10 + vpclmulqdq xmm0,xmm10,xmm20,0x11 + vpclmulqdq xmm3,xmm10,xmm20,0x00 + + vpxorq zmm4,zmm4,zmm17 + vpxorq zmm5,zmm5,zmm19 + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm16 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_488 +$L$_small_initial_partial_block_488: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm12 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[224+rdx] + vpclmulqdq zmm0,zmm6,zmm20,0x11 + vpclmulqdq zmm3,zmm6,zmm20,0x00 + vpclmulqdq zmm4,zmm6,zmm20,0x01 + vpclmulqdq zmm5,zmm6,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[288+rdx] + vpclmulqdq zmm15,zmm7,zmm20,0x11 + vpclmulqdq zmm16,zmm7,zmm20,0x00 + vpclmulqdq zmm17,zmm7,zmm20,0x01 + vpclmulqdq zmm19,zmm7,zmm20,0x10 + vpxorq zmm15,zmm0,zmm15 + vpxorq zmm16,zmm3,zmm16 + vpxorq zmm17,zmm4,zmm17 + vpxorq zmm19,zmm5,zmm19 + + vpxorq zmm17,zmm17,zmm19 + vpsrldq zmm4,zmm17,8 + vpslldq zmm5,zmm17,8 + vpxorq zmm0,zmm15,zmm4 + vpxorq zmm3,zmm16,zmm5 + vextracti64x4 ymm17,zmm0,1 + 
vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_488: + + or r13,r13 + je NEAR $L$_after_reduction_488 + vpxorq xmm14,xmm14,xmm13 +$L$_after_reduction_488: + jmp NEAR $L$_small_initial_blocks_encrypted_479 +$L$_small_initial_num_blocks_is_10_479: + vmovdqa64 zmm29,ZMMWORD[SHUF_MASK] + vshufi64x2 zmm2,zmm2,zmm2,0 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpaddd zmm3,zmm2,ZMMWORD[ddq_add_5678] + vpaddd zmm4,zmm0,ZMMWORD[ddq_add_8888] + lea r10,[byte64_len_to_mask_table] + mov r15,r13 + sub r15,128 + kmovq k1,[r15*8+r10] + vextracti32x4 xmm2,zmm4,1 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb ymm4,ymm4,ymm29 + vmovdqu8 zmm6,ZMMWORD[r11*1+r9] + vmovdqu8 zmm7,ZMMWORD[64+r11*1+r9] + vmovdqu8 ymm10{k1}{z},[128+r11*1+r9] + vbroadcastf64x2 zmm15,ZMMWORD[rcx] + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm15 + vpxorq ymm4,ymm4,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[16+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc ymm4,ymm4,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[32+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc ymm4,ymm4,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[48+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc ymm4,ymm4,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[64+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc ymm4,ymm4,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[80+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc ymm4,ymm4,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[96+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc ymm4,ymm4,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[112+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc ymm4,ymm4,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[128+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc ymm4,ymm4,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc ymm4,ymm4,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc ymm4,ymm4,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc ymm4,ymm4,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc ymm4,ymm4,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc ymm4,ymm4,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[224+rcx] + vaesenclast zmm0,zmm0,zmm15 + vaesenclast zmm3,zmm3,zmm15 + vaesenclast ymm4,ymm4,ymm15 + vpxorq zmm0,zmm0,zmm6 + vpxorq zmm3,zmm3,zmm7 + vpxorq ymm4,ymm4,ymm10 + vextracti32x4 xmm12,zmm4,1 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 YMMWORD[128+r11*1+r10]{k1},ymm4 + vmovdqu8 zmm4{k1}{z},zmm4 + vpshufb zmm6,zmm0,zmm29 + vpshufb zmm7,zmm3,zmm29 + vpshufb ymm10,ymm4,ymm29 + vextracti32x4 xmm13,zmm10,1 + sub r13,16 * (10 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_489 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[192+rdx] + vpclmulqdq 
zmm0,zmm6,zmm20,0x11 + vpclmulqdq zmm3,zmm6,zmm20,0x00 + vpclmulqdq zmm4,zmm6,zmm20,0x01 + vpclmulqdq zmm5,zmm6,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[256+rdx] + vpclmulqdq zmm15,zmm7,zmm20,0x11 + vpclmulqdq zmm16,zmm7,zmm20,0x00 + vpclmulqdq zmm17,zmm7,zmm20,0x01 + vpclmulqdq zmm19,zmm7,zmm20,0x10 + vpxorq zmm15,zmm0,zmm15 + vpxorq zmm16,zmm3,zmm16 + vpxorq zmm17,zmm4,zmm17 + vpxorq zmm19,zmm5,zmm19 + vmovdqu64 ymm20,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm10,ymm20,0x01 + vpclmulqdq ymm5,ymm10,ymm20,0x10 + vpclmulqdq ymm0,ymm10,ymm20,0x11 + vpclmulqdq ymm3,ymm10,ymm20,0x00 + + vpxorq zmm4,zmm4,zmm17 + vpxorq zmm5,zmm5,zmm19 + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm16 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_489 +$L$_small_initial_partial_block_489: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm12 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[208+rdx] + vpclmulqdq zmm0,zmm6,zmm20,0x11 + vpclmulqdq zmm3,zmm6,zmm20,0x00 + vpclmulqdq zmm4,zmm6,zmm20,0x01 + vpclmulqdq zmm5,zmm6,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[272+rdx] + vpclmulqdq zmm15,zmm7,zmm20,0x11 + vpclmulqdq zmm16,zmm7,zmm20,0x00 + vpclmulqdq zmm17,zmm7,zmm20,0x01 + vpclmulqdq zmm19,zmm7,zmm20,0x10 + vpxorq zmm15,zmm0,zmm15 + vpxorq zmm16,zmm3,zmm16 + vpxorq zmm17,zmm4,zmm17 + vpxorq zmm19,zmm5,zmm19 + vmovdqu64 xmm20,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm10,xmm20,0x01 + vpclmulqdq xmm5,xmm10,xmm20,0x10 + vpclmulqdq xmm0,xmm10,xmm20,0x11 + vpclmulqdq xmm3,xmm10,xmm20,0x00 + + vpxorq zmm4,zmm4,zmm17 + vpxorq zmm5,zmm5,zmm19 + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm16 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_489: + + or r13,r13 + je NEAR $L$_after_reduction_489 + vpxorq xmm14,xmm14,xmm13 +$L$_after_reduction_489: + jmp NEAR $L$_small_initial_blocks_encrypted_479 +$L$_small_initial_num_blocks_is_11_479: + vmovdqa64 zmm29,ZMMWORD[SHUF_MASK] + vshufi64x2 zmm2,zmm2,zmm2,0 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpaddd zmm3,zmm2,ZMMWORD[ddq_add_5678] + vpaddd zmm4,zmm0,ZMMWORD[ddq_add_8888] + lea r10,[byte64_len_to_mask_table] + mov r15,r13 + sub r15,128 + kmovq k1,[r15*8+r10] + vextracti32x4 xmm2,zmm4,2 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vmovdqu8 zmm6,ZMMWORD[r11*1+r9] + vmovdqu8 zmm7,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm10{k1}{z},[128+r11*1+r9] + vbroadcastf64x2 
zmm15,ZMMWORD[rcx] + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm15 + vpxorq zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[16+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[32+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[48+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[64+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[80+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[96+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[112+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[128+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[224+rcx] + vaesenclast zmm0,zmm0,zmm15 + vaesenclast zmm3,zmm3,zmm15 + vaesenclast zmm4,zmm4,zmm15 + vpxorq zmm0,zmm0,zmm6 + vpxorq zmm3,zmm3,zmm7 + vpxorq zmm4,zmm4,zmm10 + vextracti32x4 xmm12,zmm4,2 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10]{k1},zmm4 + vmovdqu8 zmm4{k1}{z},zmm4 + vpshufb zmm6,zmm0,zmm29 + vpshufb zmm7,zmm3,zmm29 + vpshufb zmm10,zmm4,zmm29 + vextracti32x4 xmm13,zmm10,2 + sub r13,16 * (11 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_490 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[176+rdx] + vpclmulqdq zmm0,zmm6,zmm20,0x11 + vpclmulqdq zmm3,zmm6,zmm20,0x00 + vpclmulqdq zmm4,zmm6,zmm20,0x01 + vpclmulqdq zmm5,zmm6,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[240+rdx] + vpclmulqdq zmm15,zmm7,zmm20,0x11 + vpclmulqdq zmm16,zmm7,zmm20,0x00 + vpclmulqdq zmm17,zmm7,zmm20,0x01 + vpclmulqdq zmm19,zmm7,zmm20,0x10 + vpxorq zmm15,zmm0,zmm15 + vpxorq zmm16,zmm3,zmm16 + vpxorq zmm17,zmm4,zmm17 + vpxorq zmm19,zmm5,zmm19 + vmovdqu64 ymm20,YMMWORD[304+rdx] + vinserti64x2 zmm20,zmm20,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm10,zmm20,0x01 + vpclmulqdq zmm5,zmm10,zmm20,0x10 + vpclmulqdq zmm0,zmm10,zmm20,0x11 + vpclmulqdq zmm3,zmm10,zmm20,0x00 + + vpxorq zmm4,zmm4,zmm17 + vpxorq zmm5,zmm5,zmm19 + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm16 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq 
xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_490 +$L$_small_initial_partial_block_490: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm12 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[192+rdx] + vpclmulqdq zmm0,zmm6,zmm20,0x11 + vpclmulqdq zmm3,zmm6,zmm20,0x00 + vpclmulqdq zmm4,zmm6,zmm20,0x01 + vpclmulqdq zmm5,zmm6,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[256+rdx] + vpclmulqdq zmm15,zmm7,zmm20,0x11 + vpclmulqdq zmm16,zmm7,zmm20,0x00 + vpclmulqdq zmm17,zmm7,zmm20,0x01 + vpclmulqdq zmm19,zmm7,zmm20,0x10 + vpxorq zmm15,zmm0,zmm15 + vpxorq zmm16,zmm3,zmm16 + vpxorq zmm17,zmm4,zmm17 + vpxorq zmm19,zmm5,zmm19 + vmovdqu64 ymm20,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm10,ymm20,0x01 + vpclmulqdq ymm5,ymm10,ymm20,0x10 + vpclmulqdq ymm0,ymm10,ymm20,0x11 + vpclmulqdq ymm3,ymm10,ymm20,0x00 + + vpxorq zmm4,zmm4,zmm17 + vpxorq zmm5,zmm5,zmm19 + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm16 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_490: + + or r13,r13 + je NEAR $L$_after_reduction_490 + vpxorq xmm14,xmm14,xmm13 +$L$_after_reduction_490: + jmp NEAR $L$_small_initial_blocks_encrypted_479 +$L$_small_initial_num_blocks_is_12_479: + vmovdqa64 zmm29,ZMMWORD[SHUF_MASK] + vshufi64x2 zmm2,zmm2,zmm2,0 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpaddd zmm3,zmm2,ZMMWORD[ddq_add_5678] + vpaddd zmm4,zmm0,ZMMWORD[ddq_add_8888] + lea r10,[byte64_len_to_mask_table] + mov r15,r13 + sub r15,128 + kmovq k1,[r15*8+r10] + vextracti32x4 xmm2,zmm4,3 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vmovdqu8 zmm6,ZMMWORD[r11*1+r9] + vmovdqu8 zmm7,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm10{k1}{z},[128+r11*1+r9] + vbroadcastf64x2 zmm15,ZMMWORD[rcx] + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm15 + vpxorq zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[16+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[32+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[48+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[64+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[80+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[96+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[112+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[128+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 
+ vbroadcastf64x2 zmm15,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[224+rcx] + vaesenclast zmm0,zmm0,zmm15 + vaesenclast zmm3,zmm3,zmm15 + vaesenclast zmm4,zmm4,zmm15 + vpxorq zmm0,zmm0,zmm6 + vpxorq zmm3,zmm3,zmm7 + vpxorq zmm4,zmm4,zmm10 + vextracti32x4 xmm12,zmm4,3 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10]{k1},zmm4 + vmovdqu8 zmm4{k1}{z},zmm4 + vpshufb zmm6,zmm0,zmm29 + vpshufb zmm7,zmm3,zmm29 + vpshufb zmm10,zmm4,zmm29 + vextracti32x4 xmm13,zmm10,3 + sub r13,16 * (12 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_491 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[160+rdx] + vpclmulqdq zmm0,zmm6,zmm20,0x11 + vpclmulqdq zmm3,zmm6,zmm20,0x00 + vpclmulqdq zmm4,zmm6,zmm20,0x01 + vpclmulqdq zmm5,zmm6,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[224+rdx] + vpclmulqdq zmm15,zmm7,zmm20,0x11 + vpclmulqdq zmm16,zmm7,zmm20,0x00 + vpclmulqdq zmm17,zmm7,zmm20,0x01 + vpclmulqdq zmm19,zmm7,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[288+rdx] + vpclmulqdq zmm6,zmm10,zmm20,0x11 + vpclmulqdq zmm7,zmm10,zmm20,0x00 + vpternlogq zmm15,zmm6,zmm0,0x96 + vpternlogq zmm16,zmm7,zmm3,0x96 + vpclmulqdq zmm6,zmm10,zmm20,0x01 + vpclmulqdq zmm7,zmm10,zmm20,0x10 + vpternlogq zmm17,zmm6,zmm4,0x96 + vpternlogq zmm19,zmm7,zmm5,0x96 + + vpxorq zmm17,zmm17,zmm19 + vpsrldq zmm4,zmm17,8 + vpslldq zmm5,zmm17,8 + vpxorq zmm0,zmm15,zmm4 + vpxorq zmm3,zmm16,zmm5 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_491 +$L$_small_initial_partial_block_491: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm12 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[176+rdx] + vpclmulqdq zmm0,zmm6,zmm20,0x11 + vpclmulqdq zmm3,zmm6,zmm20,0x00 + vpclmulqdq zmm4,zmm6,zmm20,0x01 + vpclmulqdq zmm5,zmm6,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[240+rdx] + vpclmulqdq zmm15,zmm7,zmm20,0x11 + vpclmulqdq zmm16,zmm7,zmm20,0x00 + vpclmulqdq zmm17,zmm7,zmm20,0x01 + vpclmulqdq zmm19,zmm7,zmm20,0x10 + vpxorq zmm15,zmm0,zmm15 + vpxorq zmm16,zmm3,zmm16 + vpxorq zmm17,zmm4,zmm17 + vpxorq zmm19,zmm5,zmm19 + vmovdqu64 ymm20,YMMWORD[304+rdx] + vinserti64x2 zmm20,zmm20,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm10,zmm20,0x01 + vpclmulqdq zmm5,zmm10,zmm20,0x10 + vpclmulqdq zmm0,zmm10,zmm20,0x11 + vpclmulqdq zmm3,zmm10,zmm20,0x00 + + vpxorq zmm4,zmm4,zmm17 + vpxorq zmm5,zmm5,zmm19 + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm16 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 
xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_491: + + or r13,r13 + je NEAR $L$_after_reduction_491 + vpxorq xmm14,xmm14,xmm13 +$L$_after_reduction_491: + jmp NEAR $L$_small_initial_blocks_encrypted_479 +$L$_small_initial_num_blocks_is_13_479: + vmovdqa64 zmm29,ZMMWORD[SHUF_MASK] + vshufi64x2 zmm2,zmm2,zmm2,0 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpaddd zmm3,zmm2,ZMMWORD[ddq_add_5678] + vpaddd zmm4,zmm0,ZMMWORD[ddq_add_8888] + vpaddd zmm5,zmm3,ZMMWORD[ddq_add_8888] + lea r10,[byte64_len_to_mask_table] + mov r15,r13 + sub r15,192 + kmovq k1,[r15*8+r10] + vextracti32x4 xmm2,zmm5,0 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb xmm5,xmm5,xmm29 + vmovdqu8 zmm6,ZMMWORD[r11*1+r9] + vmovdqu8 zmm7,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm10,ZMMWORD[128+r11*1+r9] + vmovdqu8 xmm11{k1}{z},[192+r11*1+r9] + vbroadcastf64x2 zmm15,ZMMWORD[rcx] + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm15 + vpxorq zmm4,zmm4,zmm15 + vpxorq xmm5,xmm5,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[16+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc xmm5,xmm5,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[32+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc xmm5,xmm5,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[48+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc xmm5,xmm5,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[64+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc xmm5,xmm5,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[80+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc xmm5,xmm5,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[96+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc xmm5,xmm5,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[112+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc xmm5,xmm5,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[128+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc xmm5,xmm5,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc xmm5,xmm5,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc xmm5,xmm5,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc xmm5,xmm5,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc xmm5,xmm5,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc xmm5,xmm5,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[224+rcx] + vaesenclast zmm0,zmm0,zmm15 + vaesenclast zmm3,zmm3,zmm15 + vaesenclast zmm4,zmm4,zmm15 + vaesenclast xmm5,xmm5,xmm15 + vpxorq zmm0,zmm0,zmm6 + vpxorq zmm3,zmm3,zmm7 + vpxorq zmm4,zmm4,zmm10 + vpxorq xmm5,xmm5,xmm11 + vextracti32x4 xmm12,zmm5,0 + 
mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm4 + vmovdqu8 XMMWORD[192+r11*1+r10]{k1},xmm5 + vmovdqu8 zmm5{k1}{z},zmm5 + vpshufb zmm6,zmm0,zmm29 + vpshufb zmm7,zmm3,zmm29 + vpshufb zmm10,zmm4,zmm29 + vpshufb xmm11,xmm5,xmm29 + vextracti32x4 xmm13,zmm11,0 + sub r13,16 * (13 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_492 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[144+rdx] + vpclmulqdq zmm0,zmm6,zmm20,0x11 + vpclmulqdq zmm3,zmm6,zmm20,0x00 + vpclmulqdq zmm4,zmm6,zmm20,0x01 + vpclmulqdq zmm5,zmm6,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[208+rdx] + vpclmulqdq zmm15,zmm7,zmm20,0x11 + vpclmulqdq zmm16,zmm7,zmm20,0x00 + vpclmulqdq zmm17,zmm7,zmm20,0x01 + vpclmulqdq zmm19,zmm7,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[272+rdx] + vpclmulqdq zmm6,zmm10,zmm20,0x11 + vpclmulqdq zmm7,zmm10,zmm20,0x00 + vpternlogq zmm15,zmm6,zmm0,0x96 + vpternlogq zmm16,zmm7,zmm3,0x96 + vpclmulqdq zmm6,zmm10,zmm20,0x01 + vpclmulqdq zmm7,zmm10,zmm20,0x10 + vpternlogq zmm17,zmm6,zmm4,0x96 + vpternlogq zmm19,zmm7,zmm5,0x96 + vmovdqu64 xmm20,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm11,xmm20,0x01 + vpclmulqdq xmm5,xmm11,xmm20,0x10 + vpclmulqdq xmm0,xmm11,xmm20,0x11 + vpclmulqdq xmm3,xmm11,xmm20,0x00 + + vpxorq zmm4,zmm4,zmm17 + vpxorq zmm5,zmm5,zmm19 + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm16 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_492 +$L$_small_initial_partial_block_492: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm12 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[160+rdx] + vpclmulqdq zmm0,zmm6,zmm20,0x11 + vpclmulqdq zmm3,zmm6,zmm20,0x00 + vpclmulqdq zmm4,zmm6,zmm20,0x01 + vpclmulqdq zmm5,zmm6,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[224+rdx] + vpclmulqdq zmm15,zmm7,zmm20,0x11 + vpclmulqdq zmm16,zmm7,zmm20,0x00 + vpclmulqdq zmm17,zmm7,zmm20,0x01 + vpclmulqdq zmm19,zmm7,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[288+rdx] + vpclmulqdq zmm6,zmm10,zmm20,0x11 + vpclmulqdq zmm7,zmm10,zmm20,0x00 + vpternlogq zmm15,zmm6,zmm0,0x96 + vpternlogq zmm16,zmm7,zmm3,0x96 + vpclmulqdq zmm6,zmm10,zmm20,0x01 + vpclmulqdq zmm7,zmm10,zmm20,0x10 + vpternlogq zmm17,zmm6,zmm4,0x96 + vpternlogq zmm19,zmm7,zmm5,0x96 + + vpxorq zmm17,zmm17,zmm19 + vpsrldq zmm4,zmm17,8 + vpslldq zmm5,zmm17,8 + vpxorq zmm0,zmm15,zmm4 + vpxorq zmm3,zmm16,zmm5 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_492: + + or r13,r13 + je NEAR 
$L$_after_reduction_492 + vpxorq xmm14,xmm14,xmm13 +$L$_after_reduction_492: + jmp NEAR $L$_small_initial_blocks_encrypted_479 +$L$_small_initial_num_blocks_is_14_479: + vmovdqa64 zmm29,ZMMWORD[SHUF_MASK] + vshufi64x2 zmm2,zmm2,zmm2,0 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpaddd zmm3,zmm2,ZMMWORD[ddq_add_5678] + vpaddd zmm4,zmm0,ZMMWORD[ddq_add_8888] + vpaddd zmm5,zmm3,ZMMWORD[ddq_add_8888] + lea r10,[byte64_len_to_mask_table] + mov r15,r13 + sub r15,192 + kmovq k1,[r15*8+r10] + vextracti32x4 xmm2,zmm5,1 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb ymm5,ymm5,ymm29 + vmovdqu8 zmm6,ZMMWORD[r11*1+r9] + vmovdqu8 zmm7,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm10,ZMMWORD[128+r11*1+r9] + vmovdqu8 ymm11{k1}{z},[192+r11*1+r9] + vbroadcastf64x2 zmm15,ZMMWORD[rcx] + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm15 + vpxorq zmm4,zmm4,zmm15 + vpxorq ymm5,ymm5,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[16+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc ymm5,ymm5,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[32+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc ymm5,ymm5,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[48+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc ymm5,ymm5,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[64+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc ymm5,ymm5,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[80+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc ymm5,ymm5,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[96+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc ymm5,ymm5,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[112+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc ymm5,ymm5,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[128+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc ymm5,ymm5,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc ymm5,ymm5,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc ymm5,ymm5,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc ymm5,ymm5,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc ymm5,ymm5,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc ymm5,ymm5,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[224+rcx] + vaesenclast zmm0,zmm0,zmm15 + vaesenclast zmm3,zmm3,zmm15 + vaesenclast zmm4,zmm4,zmm15 + vaesenclast ymm5,ymm5,ymm15 + vpxorq zmm0,zmm0,zmm6 + vpxorq zmm3,zmm3,zmm7 + vpxorq zmm4,zmm4,zmm10 + vpxorq ymm5,ymm5,ymm11 + vextracti32x4 xmm12,zmm5,1 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm4 + vmovdqu8 YMMWORD[192+r11*1+r10]{k1},ymm5 + vmovdqu8 zmm5{k1}{z},zmm5 + vpshufb zmm6,zmm0,zmm29 + vpshufb zmm7,zmm3,zmm29 + vpshufb zmm10,zmm4,zmm29 + vpshufb ymm11,ymm5,ymm29 + vextracti32x4 xmm13,zmm11,1 + sub r13,16 * (14 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_493 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq 
zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[128+rdx] + vpclmulqdq zmm0,zmm6,zmm20,0x11 + vpclmulqdq zmm3,zmm6,zmm20,0x00 + vpclmulqdq zmm4,zmm6,zmm20,0x01 + vpclmulqdq zmm5,zmm6,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[192+rdx] + vpclmulqdq zmm15,zmm7,zmm20,0x11 + vpclmulqdq zmm16,zmm7,zmm20,0x00 + vpclmulqdq zmm17,zmm7,zmm20,0x01 + vpclmulqdq zmm19,zmm7,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[256+rdx] + vpclmulqdq zmm6,zmm10,zmm20,0x11 + vpclmulqdq zmm7,zmm10,zmm20,0x00 + vpternlogq zmm15,zmm6,zmm0,0x96 + vpternlogq zmm16,zmm7,zmm3,0x96 + vpclmulqdq zmm6,zmm10,zmm20,0x01 + vpclmulqdq zmm7,zmm10,zmm20,0x10 + vpternlogq zmm17,zmm6,zmm4,0x96 + vpternlogq zmm19,zmm7,zmm5,0x96 + vmovdqu64 ymm20,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm11,ymm20,0x01 + vpclmulqdq ymm5,ymm11,ymm20,0x10 + vpclmulqdq ymm0,ymm11,ymm20,0x11 + vpclmulqdq ymm3,ymm11,ymm20,0x00 + + vpxorq zmm4,zmm4,zmm17 + vpxorq zmm5,zmm5,zmm19 + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm16 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_493 +$L$_small_initial_partial_block_493: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm12 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[144+rdx] + vpclmulqdq zmm0,zmm6,zmm20,0x11 + vpclmulqdq zmm3,zmm6,zmm20,0x00 + vpclmulqdq zmm4,zmm6,zmm20,0x01 + vpclmulqdq zmm5,zmm6,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[208+rdx] + vpclmulqdq zmm15,zmm7,zmm20,0x11 + vpclmulqdq zmm16,zmm7,zmm20,0x00 + vpclmulqdq zmm17,zmm7,zmm20,0x01 + vpclmulqdq zmm19,zmm7,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[272+rdx] + vpclmulqdq zmm6,zmm10,zmm20,0x11 + vpclmulqdq zmm7,zmm10,zmm20,0x00 + vpternlogq zmm15,zmm6,zmm0,0x96 + vpternlogq zmm16,zmm7,zmm3,0x96 + vpclmulqdq zmm6,zmm10,zmm20,0x01 + vpclmulqdq zmm7,zmm10,zmm20,0x10 + vpternlogq zmm17,zmm6,zmm4,0x96 + vpternlogq zmm19,zmm7,zmm5,0x96 + vmovdqu64 xmm20,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm11,xmm20,0x01 + vpclmulqdq xmm5,xmm11,xmm20,0x10 + vpclmulqdq xmm0,xmm11,xmm20,0x11 + vpclmulqdq xmm3,xmm11,xmm20,0x00 + + vpxorq zmm4,zmm4,zmm17 + vpxorq zmm5,zmm5,zmm19 + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm16 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_493: + + or r13,r13 + je NEAR $L$_after_reduction_493 + vpxorq xmm14,xmm14,xmm13 +$L$_after_reduction_493: + jmp NEAR $L$_small_initial_blocks_encrypted_479 +$L$_small_initial_num_blocks_is_15_479: + vmovdqa64 zmm29,ZMMWORD[SHUF_MASK] + 
vshufi64x2 zmm2,zmm2,zmm2,0 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpaddd zmm3,zmm2,ZMMWORD[ddq_add_5678] + vpaddd zmm4,zmm0,ZMMWORD[ddq_add_8888] + vpaddd zmm5,zmm3,ZMMWORD[ddq_add_8888] + lea r10,[byte64_len_to_mask_table] + mov r15,r13 + sub r15,192 + kmovq k1,[r15*8+r10] + vextracti32x4 xmm2,zmm5,2 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb zmm5,zmm5,zmm29 + vmovdqu8 zmm6,ZMMWORD[r11*1+r9] + vmovdqu8 zmm7,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm10,ZMMWORD[128+r11*1+r9] + vmovdqu8 zmm11{k1}{z},[192+r11*1+r9] + vbroadcastf64x2 zmm15,ZMMWORD[rcx] + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm15 + vpxorq zmm4,zmm4,zmm15 + vpxorq zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[16+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[32+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[48+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[64+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[80+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[96+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[112+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[128+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[224+rcx] + vaesenclast zmm0,zmm0,zmm15 + vaesenclast zmm3,zmm3,zmm15 + vaesenclast zmm4,zmm4,zmm15 + vaesenclast zmm5,zmm5,zmm15 + vpxorq zmm0,zmm0,zmm6 + vpxorq zmm3,zmm3,zmm7 + vpxorq zmm4,zmm4,zmm10 + vpxorq zmm5,zmm5,zmm11 + vextracti32x4 xmm12,zmm5,2 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm4 + vmovdqu8 ZMMWORD[192+r11*1+r10]{k1},zmm5 + vmovdqu8 zmm5{k1}{z},zmm5 + vpshufb zmm6,zmm0,zmm29 + vpshufb zmm7,zmm3,zmm29 + vpshufb zmm10,zmm4,zmm29 + vpshufb zmm11,zmm5,zmm29 + vextracti32x4 xmm13,zmm11,2 + sub r13,16 * (15 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_494 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[112+rdx] + vpclmulqdq zmm0,zmm6,zmm20,0x11 + vpclmulqdq zmm3,zmm6,zmm20,0x00 + vpclmulqdq zmm4,zmm6,zmm20,0x01 + vpclmulqdq zmm5,zmm6,zmm20,0x10 + vmovdqu64 
zmm20,ZMMWORD[176+rdx] + vpclmulqdq zmm15,zmm7,zmm20,0x11 + vpclmulqdq zmm16,zmm7,zmm20,0x00 + vpclmulqdq zmm17,zmm7,zmm20,0x01 + vpclmulqdq zmm19,zmm7,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[240+rdx] + vpclmulqdq zmm6,zmm10,zmm20,0x11 + vpclmulqdq zmm7,zmm10,zmm20,0x00 + vpternlogq zmm15,zmm6,zmm0,0x96 + vpternlogq zmm16,zmm7,zmm3,0x96 + vpclmulqdq zmm6,zmm10,zmm20,0x01 + vpclmulqdq zmm7,zmm10,zmm20,0x10 + vpternlogq zmm17,zmm6,zmm4,0x96 + vpternlogq zmm19,zmm7,zmm5,0x96 + vmovdqu64 ymm20,YMMWORD[304+rdx] + vinserti64x2 zmm20,zmm20,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm11,zmm20,0x01 + vpclmulqdq zmm5,zmm11,zmm20,0x10 + vpclmulqdq zmm0,zmm11,zmm20,0x11 + vpclmulqdq zmm3,zmm11,zmm20,0x00 + + vpxorq zmm4,zmm4,zmm17 + vpxorq zmm5,zmm5,zmm19 + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm16 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_494 +$L$_small_initial_partial_block_494: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm12 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[128+rdx] + vpclmulqdq zmm0,zmm6,zmm20,0x11 + vpclmulqdq zmm3,zmm6,zmm20,0x00 + vpclmulqdq zmm4,zmm6,zmm20,0x01 + vpclmulqdq zmm5,zmm6,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[192+rdx] + vpclmulqdq zmm15,zmm7,zmm20,0x11 + vpclmulqdq zmm16,zmm7,zmm20,0x00 + vpclmulqdq zmm17,zmm7,zmm20,0x01 + vpclmulqdq zmm19,zmm7,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[256+rdx] + vpclmulqdq zmm6,zmm10,zmm20,0x11 + vpclmulqdq zmm7,zmm10,zmm20,0x00 + vpternlogq zmm15,zmm6,zmm0,0x96 + vpternlogq zmm16,zmm7,zmm3,0x96 + vpclmulqdq zmm6,zmm10,zmm20,0x01 + vpclmulqdq zmm7,zmm10,zmm20,0x10 + vpternlogq zmm17,zmm6,zmm4,0x96 + vpternlogq zmm19,zmm7,zmm5,0x96 + vmovdqu64 ymm20,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm11,ymm20,0x01 + vpclmulqdq ymm5,ymm11,ymm20,0x10 + vpclmulqdq ymm0,ymm11,ymm20,0x11 + vpclmulqdq ymm3,ymm11,ymm20,0x00 + + vpxorq zmm4,zmm4,zmm17 + vpxorq zmm5,zmm5,zmm19 + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm16 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_494: + + or r13,r13 + je NEAR $L$_after_reduction_494 + vpxorq xmm14,xmm14,xmm13 +$L$_after_reduction_494: + jmp NEAR $L$_small_initial_blocks_encrypted_479 +$L$_small_initial_num_blocks_is_16_479: + vmovdqa64 zmm29,ZMMWORD[SHUF_MASK] + vshufi64x2 zmm2,zmm2,zmm2,0 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpaddd zmm3,zmm2,ZMMWORD[ddq_add_5678] + vpaddd zmm4,zmm0,ZMMWORD[ddq_add_8888] + 
vpaddd zmm5,zmm3,ZMMWORD[ddq_add_8888] + lea r10,[byte64_len_to_mask_table] + mov r15,r13 + sub r15,192 + kmovq k1,[r15*8+r10] + vextracti32x4 xmm2,zmm5,3 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb zmm5,zmm5,zmm29 + vmovdqu8 zmm6,ZMMWORD[r11*1+r9] + vmovdqu8 zmm7,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm10,ZMMWORD[128+r11*1+r9] + vmovdqu8 zmm11{k1}{z},[192+r11*1+r9] + vbroadcastf64x2 zmm15,ZMMWORD[rcx] + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm15 + vpxorq zmm4,zmm4,zmm15 + vpxorq zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[16+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[32+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[48+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[64+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[80+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[96+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[112+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[128+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[224+rcx] + vaesenclast zmm0,zmm0,zmm15 + vaesenclast zmm3,zmm3,zmm15 + vaesenclast zmm4,zmm4,zmm15 + vaesenclast zmm5,zmm5,zmm15 + vpxorq zmm0,zmm0,zmm6 + vpxorq zmm3,zmm3,zmm7 + vpxorq zmm4,zmm4,zmm10 + vpxorq zmm5,zmm5,zmm11 + vextracti32x4 xmm12,zmm5,3 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm4 + vmovdqu8 ZMMWORD[192+r11*1+r10]{k1},zmm5 + vmovdqu8 zmm5{k1}{z},zmm5 + vpshufb zmm6,zmm0,zmm29 + vpshufb zmm7,zmm3,zmm29 + vpshufb zmm10,zmm4,zmm29 + vpshufb zmm11,zmm5,zmm29 + vextracti32x4 xmm13,zmm11,3 + sub r13,16 * (16 - 1) +$L$_small_initial_partial_block_495: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm12 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[112+rdx] + vpclmulqdq zmm0,zmm6,zmm20,0x11 + vpclmulqdq zmm3,zmm6,zmm20,0x00 + vpclmulqdq zmm4,zmm6,zmm20,0x01 + vpclmulqdq zmm5,zmm6,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[176+rdx] + vpclmulqdq zmm15,zmm7,zmm20,0x11 + vpclmulqdq zmm16,zmm7,zmm20,0x00 + vpclmulqdq zmm17,zmm7,zmm20,0x01 + vpclmulqdq 
zmm19,zmm7,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[240+rdx] + vpclmulqdq zmm6,zmm10,zmm20,0x11 + vpclmulqdq zmm7,zmm10,zmm20,0x00 + vpternlogq zmm15,zmm6,zmm0,0x96 + vpternlogq zmm16,zmm7,zmm3,0x96 + vpclmulqdq zmm6,zmm10,zmm20,0x01 + vpclmulqdq zmm7,zmm10,zmm20,0x10 + vpternlogq zmm17,zmm6,zmm4,0x96 + vpternlogq zmm19,zmm7,zmm5,0x96 + vmovdqu64 ymm20,YMMWORD[304+rdx] + vinserti64x2 zmm20,zmm20,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm11,zmm20,0x01 + vpclmulqdq zmm5,zmm11,zmm20,0x10 + vpclmulqdq zmm0,zmm11,zmm20,0x11 + vpclmulqdq zmm3,zmm11,zmm20,0x00 + + vpxorq zmm4,zmm4,zmm17 + vpxorq zmm5,zmm5,zmm19 + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm16 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_495: + vpxorq xmm14,xmm14,xmm13 +$L$_after_reduction_495: +$L$_small_initial_blocks_encrypted_479: +$L$_ghash_done_334: + vmovdqu64 XMMWORD[rdx],xmm2 + vmovdqu64 XMMWORD[64+rdx],xmm14 +$L$_enc_dec_done_334: + jmp NEAR $L$exit_gcm_encrypt +$L$exit_gcm_encrypt: + cmp QWORD[112+rbp],256 + jbe NEAR $L$skip_hkeys_cleanup_496 + vpxor xmm0,xmm0,xmm0 + vmovdqa64 ZMMWORD[rsp],zmm0 + vmovdqa64 ZMMWORD[64+rsp],zmm0 + vmovdqa64 ZMMWORD[128+rsp],zmm0 + vmovdqa64 ZMMWORD[192+rsp],zmm0 + vmovdqa64 ZMMWORD[256+rsp],zmm0 + vmovdqa64 ZMMWORD[320+rsp],zmm0 + vmovdqa64 ZMMWORD[384+rsp],zmm0 + vmovdqa64 ZMMWORD[448+rsp],zmm0 + vmovdqa64 ZMMWORD[512+rsp],zmm0 + vmovdqa64 ZMMWORD[576+rsp],zmm0 + vmovdqa64 ZMMWORD[640+rsp],zmm0 + vmovdqa64 ZMMWORD[704+rsp],zmm0 +$L$skip_hkeys_cleanup_496: + vzeroupper + vmovdqu xmm15,XMMWORD[((-16))+rbp] + vmovdqu xmm14,XMMWORD[((-32))+rbp] + vmovdqu xmm13,XMMWORD[((-48))+rbp] + vmovdqu xmm12,XMMWORD[((-64))+rbp] + vmovdqu xmm11,XMMWORD[((-80))+rbp] + vmovdqu xmm10,XMMWORD[((-96))+rbp] + vmovdqu xmm9,XMMWORD[((-112))+rbp] + vmovdqu xmm8,XMMWORD[((-128))+rbp] + vmovdqu xmm7,XMMWORD[((-144))+rbp] + vmovdqu xmm6,XMMWORD[((-160))+rbp] + lea rsp,[8+rbp] + pop rsi + + pop rdi + + pop r15 + + pop r14 + + pop r13 + + pop r12 + + pop rbp + + pop rbx + + DB 0F3h,0C3h ;repret +$L$encrypt_seh_end: + + +global ossl_aes_gcm_decrypt_avx512 + +ALIGN 32 +ossl_aes_gcm_decrypt_avx512: + +$L$decrypt_seh_begin: +DB 243,15,30,250 + push rbx + +$L$decrypt_seh_push_rbx: + push rbp + +$L$decrypt_seh_push_rbp: + push r12 + +$L$decrypt_seh_push_r12: + push r13 + +$L$decrypt_seh_push_r13: + push r14 + +$L$decrypt_seh_push_r14: + push r15 + +$L$decrypt_seh_push_r15: + push rdi +$L$decrypt_seh_push_rdi: + push rsi +$L$decrypt_seh_push_rsi: + + sub rsp,168 +$L$decrypt_seh_allocstack_xmm: + + + + + + + + + + + lea rbp,[160+rsp] + +$L$decrypt_seh_setfp: + vmovdqu XMMWORD[rsp],xmm6 +$L$decrypt_seh_save_xmm6: + vmovdqu XMMWORD[16+rsp],xmm7 +$L$decrypt_seh_save_xmm7: + vmovdqu XMMWORD[32+rsp],xmm8 +$L$decrypt_seh_save_xmm8: + vmovdqu XMMWORD[48+rsp],xmm9 +$L$decrypt_seh_save_xmm9: + vmovdqu XMMWORD[64+rsp],xmm10 +$L$decrypt_seh_save_xmm10: + vmovdqu XMMWORD[80+rsp],xmm11 +$L$decrypt_seh_save_xmm11: + vmovdqu XMMWORD[96+rsp],xmm12 
+$L$decrypt_seh_save_xmm12: + vmovdqu XMMWORD[112+rsp],xmm13 +$L$decrypt_seh_save_xmm13: + vmovdqu XMMWORD[128+rsp],xmm14 +$L$decrypt_seh_save_xmm14: + vmovdqu XMMWORD[144+rsp],xmm15 +$L$decrypt_seh_save_xmm15: + +$L$decrypt_seh_prolog_end: + sub rsp,1584 + and rsp,(-64) + + + mov eax,DWORD[240+rcx] + cmp eax,9 + je NEAR $L$aes_gcm_decrypt_128_avx512 + cmp eax,11 + je NEAR $L$aes_gcm_decrypt_192_avx512 + cmp eax,13 + je NEAR $L$aes_gcm_decrypt_256_avx512 + xor eax,eax + jmp NEAR $L$exit_gcm_decrypt +ALIGN 32 +$L$aes_gcm_decrypt_128_avx512: + cmp QWORD[112+rbp],0 + je NEAR $L$_enc_dec_done_497 + xor r14,r14 + vmovdqu64 xmm14,XMMWORD[64+rdx] + + mov r11,QWORD[r8] + or r11,r11 + je NEAR $L$_partial_block_done_498 + mov r10d,16 + lea r12,[byte_len_to_mask_table] + cmp QWORD[112+rbp],r10 + cmovc r10,QWORD[112+rbp] + add r12,r10 + add r12,r10 + kmovw k1,[r12] + vmovdqu8 xmm0{k1}{z},[r9] + + vmovdqu64 xmm3,XMMWORD[16+rdx] + vmovdqu64 xmm4,XMMWORD[336+rdx] + + + + lea r12,[SHIFT_MASK] + add r12,r11 + vmovdqu64 xmm5,XMMWORD[r12] + vpshufb xmm3,xmm3,xmm5 + + vmovdqa64 xmm6,xmm0 + vpxorq xmm3,xmm3,xmm0 + + + mov r13,QWORD[112+rbp] + add r13,r11 + sub r13,16 + jge NEAR $L$_no_extra_mask_498 + sub r12,r13 +$L$_no_extra_mask_498: + + + + vmovdqu64 xmm0,XMMWORD[16+r12] + vpand xmm3,xmm3,xmm0 + vpand xmm6,xmm6,xmm0 + vpshufb xmm6,xmm6,XMMWORD[SHUF_MASK] + vpshufb xmm6,xmm6,xmm5 + vpxorq xmm14,xmm14,xmm6 + cmp r13,0 + jl NEAR $L$_partial_incomplete_498 + + vpclmulqdq xmm7,xmm14,xmm4,0x11 + vpclmulqdq xmm10,xmm14,xmm4,0x00 + vpclmulqdq xmm11,xmm14,xmm4,0x01 + vpclmulqdq xmm14,xmm14,xmm4,0x10 + vpxorq xmm14,xmm14,xmm11 + + vpsrldq xmm11,xmm14,8 + vpslldq xmm14,xmm14,8 + vpxorq xmm7,xmm7,xmm11 + vpxorq xmm14,xmm14,xmm10 + + + + vmovdqu64 xmm11,XMMWORD[POLY2] + + vpclmulqdq xmm10,xmm11,xmm14,0x01 + vpslldq xmm10,xmm10,8 + vpxorq xmm14,xmm14,xmm10 + + + + vpclmulqdq xmm10,xmm11,xmm14,0x00 + vpsrldq xmm10,xmm10,4 + vpclmulqdq xmm14,xmm11,xmm14,0x10 + vpslldq xmm14,xmm14,4 + + vpternlogq xmm14,xmm7,xmm10,0x96 + + mov QWORD[r8],0 + + mov r12,r11 + mov r11,16 + sub r11,r12 + jmp NEAR $L$_enc_dec_done_498 + +$L$_partial_incomplete_498: + mov r12,QWORD[112+rbp] + add QWORD[r8],r12 + mov r11,QWORD[112+rbp] + +$L$_enc_dec_done_498: + + + lea r12,[byte_len_to_mask_table] + kmovw k1,[r11*2+r12] + vmovdqu64 XMMWORD[64+rdx],xmm14 + mov r12,QWORD[120+rbp] + vmovdqu8 XMMWORD[r12]{k1},xmm3 +$L$_partial_block_done_498: + vmovdqu64 xmm2,XMMWORD[rdx] + mov r13,QWORD[112+rbp] + sub r13,r11 + je NEAR $L$_enc_dec_done_497 + cmp r13,256 + jbe NEAR $L$_message_below_equal_16_blocks_497 + + vmovdqa64 zmm29,ZMMWORD[SHUF_MASK] + vmovdqa64 zmm27,ZMMWORD[ddq_addbe_4444] + vmovdqa64 zmm28,ZMMWORD[ddq_addbe_1234] + + + + + + + vmovd r15d,xmm2 + and r15d,255 + + vshufi64x2 zmm2,zmm2,zmm2,0 + vpshufb zmm2,zmm2,zmm29 + + + + cmp r15b,240 + jae NEAR $L$_next_16_overflow_499 + vpaddd zmm7,zmm2,zmm28 + vpaddd zmm10,zmm7,zmm27 + vpaddd zmm11,zmm10,zmm27 + vpaddd zmm12,zmm11,zmm27 + jmp NEAR $L$_next_16_ok_499 +$L$_next_16_overflow_499: + vpshufb zmm2,zmm2,zmm29 + vmovdqa64 zmm12,ZMMWORD[ddq_add_4444] + vpaddd zmm7,zmm2,ZMMWORD[ddq_add_1234] + vpaddd zmm10,zmm7,zmm12 + vpaddd zmm11,zmm10,zmm12 + vpaddd zmm12,zmm11,zmm12 + vpshufb zmm7,zmm7,zmm29 + vpshufb zmm10,zmm10,zmm29 + vpshufb zmm11,zmm11,zmm29 + vpshufb zmm12,zmm12,zmm29 +$L$_next_16_ok_499: + vshufi64x2 zmm2,zmm12,zmm12,255 + add r15b,16 + + vmovdqu8 zmm0,ZMMWORD[r11*1+r9] + vmovdqu8 zmm3,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm4,ZMMWORD[128+r11*1+r9] + vmovdqu8 zmm5,ZMMWORD[192+r11*1+r9] + + + 
vbroadcastf64x2 zmm6,ZMMWORD[rcx] + vpxorq zmm7,zmm7,zmm6 + vpxorq zmm10,zmm10,zmm6 + vpxorq zmm11,zmm11,zmm6 + vpxorq zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[16+rcx] + vaesenc zmm7,zmm7,zmm6 + vaesenc zmm10,zmm10,zmm6 + vaesenc zmm11,zmm11,zmm6 + vaesenc zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[32+rcx] + vaesenc zmm7,zmm7,zmm6 + vaesenc zmm10,zmm10,zmm6 + vaesenc zmm11,zmm11,zmm6 + vaesenc zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[48+rcx] + vaesenc zmm7,zmm7,zmm6 + vaesenc zmm10,zmm10,zmm6 + vaesenc zmm11,zmm11,zmm6 + vaesenc zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[64+rcx] + vaesenc zmm7,zmm7,zmm6 + vaesenc zmm10,zmm10,zmm6 + vaesenc zmm11,zmm11,zmm6 + vaesenc zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[80+rcx] + vaesenc zmm7,zmm7,zmm6 + vaesenc zmm10,zmm10,zmm6 + vaesenc zmm11,zmm11,zmm6 + vaesenc zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[96+rcx] + vaesenc zmm7,zmm7,zmm6 + vaesenc zmm10,zmm10,zmm6 + vaesenc zmm11,zmm11,zmm6 + vaesenc zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[112+rcx] + vaesenc zmm7,zmm7,zmm6 + vaesenc zmm10,zmm10,zmm6 + vaesenc zmm11,zmm11,zmm6 + vaesenc zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[128+rcx] + vaesenc zmm7,zmm7,zmm6 + vaesenc zmm10,zmm10,zmm6 + vaesenc zmm11,zmm11,zmm6 + vaesenc zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[144+rcx] + vaesenc zmm7,zmm7,zmm6 + vaesenc zmm10,zmm10,zmm6 + vaesenc zmm11,zmm11,zmm6 + vaesenc zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[160+rcx] + vaesenclast zmm7,zmm7,zmm6 + vaesenclast zmm10,zmm10,zmm6 + vaesenclast zmm11,zmm11,zmm6 + vaesenclast zmm12,zmm12,zmm6 + + + vpxorq zmm7,zmm7,zmm0 + vpxorq zmm10,zmm10,zmm3 + vpxorq zmm11,zmm11,zmm4 + vpxorq zmm12,zmm12,zmm5 + + + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm7 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm10 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm11 + vmovdqu8 ZMMWORD[192+r11*1+r10],zmm12 + + vpshufb zmm7,zmm0,zmm29 + vpshufb zmm10,zmm3,zmm29 + vpshufb zmm11,zmm4,zmm29 + vpshufb zmm12,zmm5,zmm29 + vmovdqa64 ZMMWORD[768+rsp],zmm7 + vmovdqa64 ZMMWORD[832+rsp],zmm10 + vmovdqa64 ZMMWORD[896+rsp],zmm11 + vmovdqa64 ZMMWORD[960+rsp],zmm12 + test r14,r14 + jnz NEAR $L$_skip_hkeys_precomputation_500 + + vmovdqu64 zmm0,ZMMWORD[288+rdx] + vmovdqu64 ZMMWORD[704+rsp],zmm0 + + vmovdqu64 zmm3,ZMMWORD[224+rdx] + vmovdqu64 ZMMWORD[640+rsp],zmm3 + + + vshufi64x2 zmm3,zmm3,zmm3,0x00 + + vmovdqu64 zmm4,ZMMWORD[160+rdx] + vmovdqu64 ZMMWORD[576+rsp],zmm4 + + vmovdqu64 zmm5,ZMMWORD[96+rdx] + vmovdqu64 ZMMWORD[512+rsp],zmm5 +$L$_skip_hkeys_precomputation_500: + cmp r13,512 + jb NEAR $L$_message_below_32_blocks_497 + + + + cmp r15b,240 + jae NEAR $L$_next_16_overflow_501 + vpaddd zmm7,zmm2,zmm28 + vpaddd zmm10,zmm7,zmm27 + vpaddd zmm11,zmm10,zmm27 + vpaddd zmm12,zmm11,zmm27 + jmp NEAR $L$_next_16_ok_501 +$L$_next_16_overflow_501: + vpshufb zmm2,zmm2,zmm29 + vmovdqa64 zmm12,ZMMWORD[ddq_add_4444] + vpaddd zmm7,zmm2,ZMMWORD[ddq_add_1234] + vpaddd zmm10,zmm7,zmm12 + vpaddd zmm11,zmm10,zmm12 + vpaddd zmm12,zmm11,zmm12 + vpshufb zmm7,zmm7,zmm29 + vpshufb zmm10,zmm10,zmm29 + vpshufb zmm11,zmm11,zmm29 + vpshufb zmm12,zmm12,zmm29 +$L$_next_16_ok_501: + vshufi64x2 zmm2,zmm12,zmm12,255 + add r15b,16 + + vmovdqu8 zmm0,ZMMWORD[256+r11*1+r9] + vmovdqu8 zmm3,ZMMWORD[320+r11*1+r9] + vmovdqu8 zmm4,ZMMWORD[384+r11*1+r9] + vmovdqu8 zmm5,ZMMWORD[448+r11*1+r9] + + + vbroadcastf64x2 zmm6,ZMMWORD[rcx] + vpxorq zmm7,zmm7,zmm6 + vpxorq zmm10,zmm10,zmm6 + vpxorq zmm11,zmm11,zmm6 + vpxorq zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[16+rcx] + vaesenc 
zmm7,zmm7,zmm6 + vaesenc zmm10,zmm10,zmm6 + vaesenc zmm11,zmm11,zmm6 + vaesenc zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[32+rcx] + vaesenc zmm7,zmm7,zmm6 + vaesenc zmm10,zmm10,zmm6 + vaesenc zmm11,zmm11,zmm6 + vaesenc zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[48+rcx] + vaesenc zmm7,zmm7,zmm6 + vaesenc zmm10,zmm10,zmm6 + vaesenc zmm11,zmm11,zmm6 + vaesenc zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[64+rcx] + vaesenc zmm7,zmm7,zmm6 + vaesenc zmm10,zmm10,zmm6 + vaesenc zmm11,zmm11,zmm6 + vaesenc zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[80+rcx] + vaesenc zmm7,zmm7,zmm6 + vaesenc zmm10,zmm10,zmm6 + vaesenc zmm11,zmm11,zmm6 + vaesenc zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[96+rcx] + vaesenc zmm7,zmm7,zmm6 + vaesenc zmm10,zmm10,zmm6 + vaesenc zmm11,zmm11,zmm6 + vaesenc zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[112+rcx] + vaesenc zmm7,zmm7,zmm6 + vaesenc zmm10,zmm10,zmm6 + vaesenc zmm11,zmm11,zmm6 + vaesenc zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[128+rcx] + vaesenc zmm7,zmm7,zmm6 + vaesenc zmm10,zmm10,zmm6 + vaesenc zmm11,zmm11,zmm6 + vaesenc zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[144+rcx] + vaesenc zmm7,zmm7,zmm6 + vaesenc zmm10,zmm10,zmm6 + vaesenc zmm11,zmm11,zmm6 + vaesenc zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[160+rcx] + vaesenclast zmm7,zmm7,zmm6 + vaesenclast zmm10,zmm10,zmm6 + vaesenclast zmm11,zmm11,zmm6 + vaesenclast zmm12,zmm12,zmm6 + + + vpxorq zmm7,zmm7,zmm0 + vpxorq zmm10,zmm10,zmm3 + vpxorq zmm11,zmm11,zmm4 + vpxorq zmm12,zmm12,zmm5 + + + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[256+r11*1+r10],zmm7 + vmovdqu8 ZMMWORD[320+r11*1+r10],zmm10 + vmovdqu8 ZMMWORD[384+r11*1+r10],zmm11 + vmovdqu8 ZMMWORD[448+r11*1+r10],zmm12 + + vpshufb zmm7,zmm0,zmm29 + vpshufb zmm10,zmm3,zmm29 + vpshufb zmm11,zmm4,zmm29 + vpshufb zmm12,zmm5,zmm29 + vmovdqa64 ZMMWORD[1024+rsp],zmm7 + vmovdqa64 ZMMWORD[1088+rsp],zmm10 + vmovdqa64 ZMMWORD[1152+rsp],zmm11 + vmovdqa64 ZMMWORD[1216+rsp],zmm12 + test r14,r14 + jnz NEAR $L$_skip_hkeys_precomputation_502 + vmovdqu64 zmm3,ZMMWORD[640+rsp] + + + vshufi64x2 zmm3,zmm3,zmm3,0x00 + + vmovdqu64 zmm4,ZMMWORD[576+rsp] + vmovdqu64 zmm5,ZMMWORD[512+rsp] + + vpclmulqdq zmm6,zmm4,zmm3,0x11 + vpclmulqdq zmm7,zmm4,zmm3,0x00 + vpclmulqdq zmm10,zmm4,zmm3,0x01 + vpclmulqdq zmm4,zmm4,zmm3,0x10 + vpxorq zmm4,zmm4,zmm10 + + vpsrldq zmm10,zmm4,8 + vpslldq zmm4,zmm4,8 + vpxorq zmm6,zmm6,zmm10 + vpxorq zmm4,zmm4,zmm7 + + + + vmovdqu64 zmm10,ZMMWORD[POLY2] + + vpclmulqdq zmm7,zmm10,zmm4,0x01 + vpslldq zmm7,zmm7,8 + vpxorq zmm4,zmm4,zmm7 + + + + vpclmulqdq zmm7,zmm10,zmm4,0x00 + vpsrldq zmm7,zmm7,4 + vpclmulqdq zmm4,zmm10,zmm4,0x10 + vpslldq zmm4,zmm4,4 + + vpternlogq zmm4,zmm6,zmm7,0x96 + + vmovdqu64 ZMMWORD[448+rsp],zmm4 + + vpclmulqdq zmm6,zmm5,zmm3,0x11 + vpclmulqdq zmm7,zmm5,zmm3,0x00 + vpclmulqdq zmm10,zmm5,zmm3,0x01 + vpclmulqdq zmm5,zmm5,zmm3,0x10 + vpxorq zmm5,zmm5,zmm10 + + vpsrldq zmm10,zmm5,8 + vpslldq zmm5,zmm5,8 + vpxorq zmm6,zmm6,zmm10 + vpxorq zmm5,zmm5,zmm7 + + + + vmovdqu64 zmm10,ZMMWORD[POLY2] + + vpclmulqdq zmm7,zmm10,zmm5,0x01 + vpslldq zmm7,zmm7,8 + vpxorq zmm5,zmm5,zmm7 + + + + vpclmulqdq zmm7,zmm10,zmm5,0x00 + vpsrldq zmm7,zmm7,4 + vpclmulqdq zmm5,zmm10,zmm5,0x10 + vpslldq zmm5,zmm5,4 + + vpternlogq zmm5,zmm6,zmm7,0x96 + + vmovdqu64 ZMMWORD[384+rsp],zmm5 + + vpclmulqdq zmm6,zmm4,zmm3,0x11 + vpclmulqdq zmm7,zmm4,zmm3,0x00 + vpclmulqdq zmm10,zmm4,zmm3,0x01 + vpclmulqdq zmm4,zmm4,zmm3,0x10 + vpxorq zmm4,zmm4,zmm10 + + vpsrldq zmm10,zmm4,8 + vpslldq zmm4,zmm4,8 + vpxorq zmm6,zmm6,zmm10 + vpxorq 
zmm4,zmm4,zmm7 + + + + vmovdqu64 zmm10,ZMMWORD[POLY2] + + vpclmulqdq zmm7,zmm10,zmm4,0x01 + vpslldq zmm7,zmm7,8 + vpxorq zmm4,zmm4,zmm7 + + + + vpclmulqdq zmm7,zmm10,zmm4,0x00 + vpsrldq zmm7,zmm7,4 + vpclmulqdq zmm4,zmm10,zmm4,0x10 + vpslldq zmm4,zmm4,4 + + vpternlogq zmm4,zmm6,zmm7,0x96 + + vmovdqu64 ZMMWORD[320+rsp],zmm4 + + vpclmulqdq zmm6,zmm5,zmm3,0x11 + vpclmulqdq zmm7,zmm5,zmm3,0x00 + vpclmulqdq zmm10,zmm5,zmm3,0x01 + vpclmulqdq zmm5,zmm5,zmm3,0x10 + vpxorq zmm5,zmm5,zmm10 + + vpsrldq zmm10,zmm5,8 + vpslldq zmm5,zmm5,8 + vpxorq zmm6,zmm6,zmm10 + vpxorq zmm5,zmm5,zmm7 + + + + vmovdqu64 zmm10,ZMMWORD[POLY2] + + vpclmulqdq zmm7,zmm10,zmm5,0x01 + vpslldq zmm7,zmm7,8 + vpxorq zmm5,zmm5,zmm7 + + + + vpclmulqdq zmm7,zmm10,zmm5,0x00 + vpsrldq zmm7,zmm7,4 + vpclmulqdq zmm5,zmm10,zmm5,0x10 + vpslldq zmm5,zmm5,4 + + vpternlogq zmm5,zmm6,zmm7,0x96 + + vmovdqu64 ZMMWORD[256+rsp],zmm5 + + vpclmulqdq zmm6,zmm4,zmm3,0x11 + vpclmulqdq zmm7,zmm4,zmm3,0x00 + vpclmulqdq zmm10,zmm4,zmm3,0x01 + vpclmulqdq zmm4,zmm4,zmm3,0x10 + vpxorq zmm4,zmm4,zmm10 + + vpsrldq zmm10,zmm4,8 + vpslldq zmm4,zmm4,8 + vpxorq zmm6,zmm6,zmm10 + vpxorq zmm4,zmm4,zmm7 + + + + vmovdqu64 zmm10,ZMMWORD[POLY2] + + vpclmulqdq zmm7,zmm10,zmm4,0x01 + vpslldq zmm7,zmm7,8 + vpxorq zmm4,zmm4,zmm7 + + + + vpclmulqdq zmm7,zmm10,zmm4,0x00 + vpsrldq zmm7,zmm7,4 + vpclmulqdq zmm4,zmm10,zmm4,0x10 + vpslldq zmm4,zmm4,4 + + vpternlogq zmm4,zmm6,zmm7,0x96 + + vmovdqu64 ZMMWORD[192+rsp],zmm4 + + vpclmulqdq zmm6,zmm5,zmm3,0x11 + vpclmulqdq zmm7,zmm5,zmm3,0x00 + vpclmulqdq zmm10,zmm5,zmm3,0x01 + vpclmulqdq zmm5,zmm5,zmm3,0x10 + vpxorq zmm5,zmm5,zmm10 + + vpsrldq zmm10,zmm5,8 + vpslldq zmm5,zmm5,8 + vpxorq zmm6,zmm6,zmm10 + vpxorq zmm5,zmm5,zmm7 + + + + vmovdqu64 zmm10,ZMMWORD[POLY2] + + vpclmulqdq zmm7,zmm10,zmm5,0x01 + vpslldq zmm7,zmm7,8 + vpxorq zmm5,zmm5,zmm7 + + + + vpclmulqdq zmm7,zmm10,zmm5,0x00 + vpsrldq zmm7,zmm7,4 + vpclmulqdq zmm5,zmm10,zmm5,0x10 + vpslldq zmm5,zmm5,4 + + vpternlogq zmm5,zmm6,zmm7,0x96 + + vmovdqu64 ZMMWORD[128+rsp],zmm5 + + vpclmulqdq zmm6,zmm4,zmm3,0x11 + vpclmulqdq zmm7,zmm4,zmm3,0x00 + vpclmulqdq zmm10,zmm4,zmm3,0x01 + vpclmulqdq zmm4,zmm4,zmm3,0x10 + vpxorq zmm4,zmm4,zmm10 + + vpsrldq zmm10,zmm4,8 + vpslldq zmm4,zmm4,8 + vpxorq zmm6,zmm6,zmm10 + vpxorq zmm4,zmm4,zmm7 + + + + vmovdqu64 zmm10,ZMMWORD[POLY2] + + vpclmulqdq zmm7,zmm10,zmm4,0x01 + vpslldq zmm7,zmm7,8 + vpxorq zmm4,zmm4,zmm7 + + + + vpclmulqdq zmm7,zmm10,zmm4,0x00 + vpsrldq zmm7,zmm7,4 + vpclmulqdq zmm4,zmm10,zmm4,0x10 + vpslldq zmm4,zmm4,4 + + vpternlogq zmm4,zmm6,zmm7,0x96 + + vmovdqu64 ZMMWORD[64+rsp],zmm4 + + vpclmulqdq zmm6,zmm5,zmm3,0x11 + vpclmulqdq zmm7,zmm5,zmm3,0x00 + vpclmulqdq zmm10,zmm5,zmm3,0x01 + vpclmulqdq zmm5,zmm5,zmm3,0x10 + vpxorq zmm5,zmm5,zmm10 + + vpsrldq zmm10,zmm5,8 + vpslldq zmm5,zmm5,8 + vpxorq zmm6,zmm6,zmm10 + vpxorq zmm5,zmm5,zmm7 + + + + vmovdqu64 zmm10,ZMMWORD[POLY2] + + vpclmulqdq zmm7,zmm10,zmm5,0x01 + vpslldq zmm7,zmm7,8 + vpxorq zmm5,zmm5,zmm7 + + + + vpclmulqdq zmm7,zmm10,zmm5,0x00 + vpsrldq zmm7,zmm7,4 + vpclmulqdq zmm5,zmm10,zmm5,0x10 + vpslldq zmm5,zmm5,4 + + vpternlogq zmm5,zmm6,zmm7,0x96 + + vmovdqu64 ZMMWORD[rsp],zmm5 +$L$_skip_hkeys_precomputation_502: + mov r14,1 + add r11,512 + sub r13,512 + + cmp r13,768 + jb NEAR $L$_no_more_big_nblocks_497 +$L$_encrypt_big_nblocks_497: + cmp r15b,240 + jae NEAR $L$_16_blocks_overflow_503 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + vpaddd zmm5,zmm4,zmm27 + jmp NEAR $L$_16_blocks_ok_503 +$L$_16_blocks_overflow_503: + vpshufb zmm2,zmm2,zmm29 + 
vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpaddd zmm5,zmm4,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb zmm5,zmm5,zmm29 +$L$_16_blocks_ok_503: + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rsp] + + + + + vshufi64x2 zmm2,zmm5,zmm5,255 + add r15b,16 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + + + + + + + + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + + vpclmulqdq zmm6,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + + + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + + + + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + + vpternlogq zmm6,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + + + + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + + + + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20,ZMMWORD[128+r11*1+r9] + vmovdqu8 zmm21,ZMMWORD[192+r11*1+r9] + + + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + + + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm26,zmm10,zmm15 + vpxorq zmm24,zmm6,zmm12 + vpxorq zmm25,zmm7,zmm13 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vaesenclast zmm5,zmm5,zmm30 + + + + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vpxorq zmm5,zmm5,zmm21 + + + + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm4 + vmovdqu8 ZMMWORD[192+r11*1+r10],zmm5 + vpshufb 
zmm0,zmm17,zmm29 + vpshufb zmm3,zmm19,zmm29 + vpshufb zmm4,zmm20,zmm29 + vpshufb zmm5,zmm21,zmm29 + vmovdqa64 ZMMWORD[1280+rsp],zmm0 + vmovdqa64 ZMMWORD[1344+rsp],zmm3 + vmovdqa64 ZMMWORD[1408+rsp],zmm4 + vmovdqa64 ZMMWORD[1472+rsp],zmm5 + cmp r15b,240 + jae NEAR $L$_16_blocks_overflow_504 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + vpaddd zmm5,zmm4,zmm27 + jmp NEAR $L$_16_blocks_ok_504 +$L$_16_blocks_overflow_504: + vpshufb zmm2,zmm2,zmm29 + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpaddd zmm5,zmm4,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb zmm5,zmm5,zmm29 +$L$_16_blocks_ok_504: + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1024+rsp] + vmovdqu64 zmm1,ZMMWORD[256+rsp] + + + + + vshufi64x2 zmm2,zmm5,zmm5,255 + add r15b,16 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[320+rsp] + vmovdqa64 zmm22,ZMMWORD[1088+rsp] + + + + + + + + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + + vpclmulqdq zmm6,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[384+rsp] + vmovdqa64 zmm8,ZMMWORD[1152+rsp] + + + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[448+rsp] + vmovdqa64 zmm22,ZMMWORD[1216+rsp] + + + + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + + vpternlogq zmm6,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + + + + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + + + + vmovdqu8 zmm17,ZMMWORD[256+r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[320+r11*1+r9] + vmovdqu8 zmm20,ZMMWORD[384+r11*1+r9] + vmovdqu8 zmm21,ZMMWORD[448+r11*1+r9] + + + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + + + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm26,zmm10,zmm15,0x96 + vpternlogq zmm24,zmm6,zmm12,0x96 + vpternlogq zmm25,zmm7,zmm13,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + 
vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vaesenclast zmm5,zmm5,zmm30 + + + + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vpxorq zmm5,zmm5,zmm21 + + + + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[256+r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[320+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[384+r11*1+r10],zmm4 + vmovdqu8 ZMMWORD[448+r11*1+r10],zmm5 + vpshufb zmm0,zmm17,zmm29 + vpshufb zmm3,zmm19,zmm29 + vpshufb zmm4,zmm20,zmm29 + vpshufb zmm5,zmm21,zmm29 + vmovdqa64 ZMMWORD[768+rsp],zmm0 + vmovdqa64 ZMMWORD[832+rsp],zmm3 + vmovdqa64 ZMMWORD[896+rsp],zmm4 + vmovdqa64 ZMMWORD[960+rsp],zmm5 + cmp r15b,240 + jae NEAR $L$_16_blocks_overflow_505 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + vpaddd zmm5,zmm4,zmm27 + jmp NEAR $L$_16_blocks_ok_505 +$L$_16_blocks_overflow_505: + vpshufb zmm2,zmm2,zmm29 + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpaddd zmm5,zmm4,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb zmm5,zmm5,zmm29 +$L$_16_blocks_ok_505: + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1280+rsp] + vmovdqu64 zmm1,ZMMWORD[512+rsp] + + + + + vshufi64x2 zmm2,zmm5,zmm5,255 + add r15b,16 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[576+rsp] + vmovdqa64 zmm22,ZMMWORD[1344+rsp] + + + + + + + + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + + vpclmulqdq zmm6,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[640+rsp] + vmovdqa64 zmm8,ZMMWORD[1408+rsp] + + + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[704+rsp] + vmovdqa64 zmm22,ZMMWORD[1472+rsp] + + + + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + + vpternlogq zmm6,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + + + + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + + + + vmovdqu8 zmm17,ZMMWORD[512+r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[576+r11*1+r9] + vmovdqu8 zmm20,ZMMWORD[640+r11*1+r9] + vmovdqu8 zmm21,ZMMWORD[704+r11*1+r9] + + + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + + + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc 
zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + + + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm10,zmm26,zmm15,0x96 + + vpsrldq zmm15,zmm10,8 + vpslldq zmm10,zmm10,8 + + vmovdqa64 xmm16,XMMWORD[POLY2] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vpternlogq zmm6,zmm12,zmm15,0x96 + vpxorq zmm6,zmm6,zmm24 + vpternlogq zmm7,zmm13,zmm10,0x96 + vpxorq zmm7,zmm7,zmm25 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vextracti64x4 ymm12,zmm6,1 + vpxorq ymm6,ymm6,ymm12 + vextracti32x4 xmm12,ymm6,1 + vpxorq xmm6,xmm6,xmm12 + vextracti64x4 ymm13,zmm7,1 + vpxorq ymm7,ymm7,ymm13 + vextracti32x4 xmm13,ymm7,1 + vpxorq xmm7,xmm7,xmm13 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vpclmulqdq xmm13,xmm16,xmm7,0x01 + vpslldq xmm13,xmm13,8 + vpxorq xmm13,xmm7,xmm13 + vpclmulqdq xmm12,xmm16,xmm13,0x00 + vpsrldq xmm12,xmm12,4 + vpclmulqdq xmm15,xmm16,xmm13,0x10 + vpslldq xmm15,xmm15,4 + + vpternlogq xmm6,xmm15,xmm12,0x96 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vaesenclast zmm5,zmm5,zmm30 + + + + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vpxorq zmm5,zmm5,zmm21 + + + + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[512+r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[576+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[640+r11*1+r10],zmm4 + vmovdqu8 ZMMWORD[704+r11*1+r10],zmm5 + vpshufb zmm0,zmm17,zmm29 + vpshufb zmm3,zmm19,zmm29 + vpshufb zmm4,zmm20,zmm29 + vpshufb zmm5,zmm21,zmm29 + vmovdqa64 ZMMWORD[1024+rsp],zmm0 + vmovdqa64 ZMMWORD[1088+rsp],zmm3 + vmovdqa64 ZMMWORD[1152+rsp],zmm4 + vmovdqa64 ZMMWORD[1216+rsp],zmm5 + vmovdqa64 zmm14,zmm6 + + add r11,768 + sub r13,768 + cmp r13,768 + jae NEAR $L$_encrypt_big_nblocks_497 + +$L$_no_more_big_nblocks_497: + + cmp r13,512 + jae NEAR $L$_encrypt_32_blocks_497 + + cmp r13,256 + jae NEAR $L$_encrypt_16_blocks_497 +$L$_encrypt_0_blocks_ghash_32_497: + mov r10d,r13d + and r10d,~15 + mov ebx,256 + sub ebx,r10d + vmovdqa64 zmm13,ZMMWORD[768+rsp] + vpxorq zmm13,zmm13,zmm14 + vmovdqu64 zmm12,ZMMWORD[rbx*1+rsp] + vpclmulqdq zmm0,zmm13,zmm12,0x11 + vpclmulqdq zmm3,zmm13,zmm12,0x00 + vpclmulqdq zmm4,zmm13,zmm12,0x01 + vpclmulqdq zmm5,zmm13,zmm12,0x10 + vmovdqa64 zmm13,ZMMWORD[832+rsp] + vmovdqu64 zmm12,ZMMWORD[64+rbx*1+rsp] + vpclmulqdq zmm6,zmm13,zmm12,0x11 + vpclmulqdq zmm7,zmm13,zmm12,0x00 + vpclmulqdq zmm10,zmm13,zmm12,0x01 + vpclmulqdq zmm11,zmm13,zmm12,0x10 + vpxorq zmm26,zmm4,zmm10 + vpxorq zmm24,zmm0,zmm6 + vpxorq zmm25,zmm3,zmm7 + vpternlogq zmm26,zmm5,zmm11,0x96 + vmovdqa64 zmm13,ZMMWORD[896+rsp] + vmovdqu64 zmm12,ZMMWORD[128+rbx*1+rsp] + vpclmulqdq zmm0,zmm13,zmm12,0x11 + vpclmulqdq zmm3,zmm13,zmm12,0x00 + vpclmulqdq zmm4,zmm13,zmm12,0x01 + vpclmulqdq zmm5,zmm13,zmm12,0x10 + vmovdqa64 zmm13,ZMMWORD[960+rsp] + vmovdqu64 zmm12,ZMMWORD[192+rbx*1+rsp] + vpclmulqdq zmm6,zmm13,zmm12,0x11 + vpclmulqdq zmm7,zmm13,zmm12,0x00 + vpclmulqdq zmm10,zmm13,zmm12,0x01 + vpclmulqdq zmm11,zmm13,zmm12,0x10 + + vpternlogq zmm26,zmm4,zmm10,0x96 + vpternlogq zmm24,zmm0,zmm6,0x96 + vpternlogq zmm25,zmm3,zmm7,0x96 + vpternlogq zmm26,zmm5,zmm11,0x96 + add ebx,256 + mov r10d,r13d + add r10d,15 + shr r10d,4 + je NEAR $L$_last_num_blocks_is_0_506 + + cmp r10d,8 + je NEAR $L$_last_num_blocks_is_8_506 + jb NEAR $L$_last_num_blocks_is_7_1_506 + + + 
cmp r10d,12 + je NEAR $L$_last_num_blocks_is_12_506 + jb NEAR $L$_last_num_blocks_is_11_9_506 + + + cmp r10d,15 + je NEAR $L$_last_num_blocks_is_15_506 + ja NEAR $L$_last_num_blocks_is_16_506 + cmp r10d,14 + je NEAR $L$_last_num_blocks_is_14_506 + jmp NEAR $L$_last_num_blocks_is_13_506 + +$L$_last_num_blocks_is_11_9_506: + + cmp r10d,10 + je NEAR $L$_last_num_blocks_is_10_506 + ja NEAR $L$_last_num_blocks_is_11_506 + jmp NEAR $L$_last_num_blocks_is_9_506 + +$L$_last_num_blocks_is_7_1_506: + cmp r10d,4 + je NEAR $L$_last_num_blocks_is_4_506 + jb NEAR $L$_last_num_blocks_is_3_1_506 + + cmp r10d,6 + ja NEAR $L$_last_num_blocks_is_7_506 + je NEAR $L$_last_num_blocks_is_6_506 + jmp NEAR $L$_last_num_blocks_is_5_506 + +$L$_last_num_blocks_is_3_1_506: + + cmp r10d,2 + ja NEAR $L$_last_num_blocks_is_3_506 + je NEAR $L$_last_num_blocks_is_2_506 +$L$_last_num_blocks_is_1_506: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + kmovq k1,[rax*8+r10] + cmp r15d,255 + jae NEAR $L$_16_blocks_overflow_507 + vpaddd xmm0,xmm2,xmm28 + jmp NEAR $L$_16_blocks_ok_507 + +$L$_16_blocks_overflow_507: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpshufb xmm0,xmm0,xmm29 +$L$_16_blocks_ok_507: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1024+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm0,0 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1088+rsp] + vpxorq xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[1152+rsp] + vaesenc xmm0,xmm0,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1216+rsp] + vaesenc xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc xmm0,xmm0,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 xmm17{k1}{z},[r11*1+r9] + vaesenc xmm0,xmm0,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm24,zmm14,zmm12,0x96 + vpternlogq zmm25,zmm7,zmm13,0x96 + vpternlogq zmm26,zmm10,zmm15,0x96 + vaesenc xmm0,xmm0,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc xmm0,xmm0,xmm31 + vaesenclast xmm0,xmm0,xmm30 + vpxorq xmm0,xmm0,xmm17 + vextracti32x4 xmm11,zmm0,0 + mov r10,QWORD[120+rbp] + vmovdqu8 XMMWORD[r11*1+r10]{k1},xmm0 + vmovdqu8 zmm17{k1}{z},zmm17 + vpshufb xmm17,xmm17,xmm29 + vextracti32x4 xmm7,zmm17,0 + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_508 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 xmm1,XMMWORD[336+rdx] + 
vpclmulqdq xmm4,xmm17,xmm1,0x01 + vpclmulqdq xmm5,xmm17,xmm1,0x10 + vpclmulqdq xmm0,xmm17,xmm1,0x11 + vpclmulqdq xmm3,xmm17,xmm1,0x00 + vpxorq zmm4,zmm4,zmm26 + vpxorq zmm0,zmm0,zmm24 + vpxorq zmm3,zmm3,zmm25 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_508 +$L$_small_initial_partial_block_508: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + + + vpsrldq zmm0,zmm26,8 + vpslldq zmm3,zmm26,8 + vpxorq zmm24,zmm24,zmm0 + vpxorq zmm25,zmm25,zmm3 + vextracti64x4 ymm0,zmm24,1 + vpxorq ymm24,ymm24,ymm0 + vextracti32x4 xmm0,ymm24,1 + vpxorq xmm24,xmm24,xmm0 + vextracti64x4 ymm3,zmm25,1 + vpxorq ymm25,ymm25,ymm3 + vextracti32x4 xmm3,ymm25,1 + vpxorq xmm25,xmm25,xmm3 + vmovdqa64 xmm0,XMMWORD[POLY2] + + + vpclmulqdq xmm3,xmm0,xmm25,0x01 + vpslldq xmm3,xmm3,8 + vpxorq xmm3,xmm25,xmm3 + + + vpclmulqdq xmm4,xmm0,xmm3,0x00 + vpsrldq xmm4,xmm4,4 + vpclmulqdq xmm14,xmm0,xmm3,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm4,xmm24,0x96 + + + + + + + + + + + + + vpxorq xmm14,xmm14,xmm7 + + jmp NEAR $L$_after_reduction_508 +$L$_small_initial_compute_done_508: +$L$_after_reduction_508: + jmp NEAR $L$_last_blocks_done_506 +$L$_last_num_blocks_is_2_506: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + kmovq k1,[rax*8+r10] + cmp r15d,254 + jae NEAR $L$_16_blocks_overflow_509 + vpaddd ymm0,ymm2,ymm28 + jmp NEAR $L$_16_blocks_ok_509 + +$L$_16_blocks_overflow_509: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpshufb ymm0,ymm0,ymm29 +$L$_16_blocks_ok_509: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1024+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm0,1 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1088+rsp] + vpxorq ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[1152+rsp] + vaesenc ymm0,ymm0,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1216+rsp] + vaesenc ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc ymm0,ymm0,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 ymm17{k1}{z},[r11*1+r9] + vaesenc ymm0,ymm0,ymm31 + vbroadcastf64x2 
zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm24,zmm14,zmm12,0x96 + vpternlogq zmm25,zmm7,zmm13,0x96 + vpternlogq zmm26,zmm10,zmm15,0x96 + vaesenc ymm0,ymm0,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc ymm0,ymm0,ymm31 + vaesenclast ymm0,ymm0,ymm30 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm11,zmm0,1 + mov r10,QWORD[120+rbp] + vmovdqu8 YMMWORD[r11*1+r10]{k1},ymm0 + vmovdqu8 zmm17{k1}{z},zmm17 + vpshufb ymm17,ymm17,ymm29 + vextracti32x4 xmm7,zmm17,1 + sub r13,16 * (2 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_510 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm17,ymm1,0x01 + vpclmulqdq ymm5,ymm17,ymm1,0x10 + vpclmulqdq ymm0,ymm17,ymm1,0x11 + vpclmulqdq ymm3,ymm17,ymm1,0x00 + vpxorq zmm4,zmm4,zmm26 + vpxorq zmm0,zmm0,zmm24 + vpxorq zmm3,zmm3,zmm25 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_510 +$L$_small_initial_partial_block_510: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm17,xmm1,0x01 + vpclmulqdq xmm5,xmm17,xmm1,0x10 + vpclmulqdq xmm0,xmm17,xmm1,0x11 + vpclmulqdq xmm3,xmm17,xmm1,0x00 + vpxorq zmm4,zmm4,zmm26 + vpxorq zmm0,zmm0,zmm24 + vpxorq zmm3,zmm3,zmm25 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_510: + + or r13,r13 + je NEAR $L$_after_reduction_510 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_510: + jmp NEAR $L$_last_blocks_done_506 +$L$_last_num_blocks_is_3_506: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + kmovq k1,[rax*8+r10] + cmp r15d,253 + jae NEAR $L$_16_blocks_overflow_511 + vpaddd zmm0,zmm2,zmm28 + jmp NEAR $L$_16_blocks_ok_511 + +$L$_16_blocks_overflow_511: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpshufb zmm0,zmm0,zmm29 +$L$_16_blocks_ok_511: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1024+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm0,2 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + 
vmovdqa64 zmm22,ZMMWORD[1088+rsp] + vpxorq zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[1152+rsp] + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1216+rsp] + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17{k1}{z},[r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm24,zmm14,zmm12,0x96 + vpternlogq zmm25,zmm7,zmm13,0x96 + vpternlogq zmm26,zmm10,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vpxorq zmm0,zmm0,zmm17 + vextracti32x4 xmm11,zmm0,2 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10]{k1},zmm0 + vmovdqu8 zmm17{k1}{z},zmm17 + vpshufb zmm17,zmm17,zmm29 + vextracti32x4 xmm7,zmm17,2 + sub r13,16 * (3 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_512 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpxorq zmm4,zmm4,zmm26 + vpxorq zmm0,zmm0,zmm24 + vpxorq zmm3,zmm3,zmm25 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_512 +$L$_small_initial_partial_block_512: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm17,ymm1,0x01 + vpclmulqdq ymm5,ymm17,ymm1,0x10 + vpclmulqdq ymm0,ymm17,ymm1,0x11 + vpclmulqdq ymm3,ymm17,ymm1,0x00 + vpxorq zmm4,zmm4,zmm26 + vpxorq zmm0,zmm0,zmm24 + vpxorq zmm3,zmm3,zmm25 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + 
vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_512: + + or r13,r13 + je NEAR $L$_after_reduction_512 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_512: + jmp NEAR $L$_last_blocks_done_506 +$L$_last_num_blocks_is_4_506: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + kmovq k1,[rax*8+r10] + cmp r15d,252 + jae NEAR $L$_16_blocks_overflow_513 + vpaddd zmm0,zmm2,zmm28 + jmp NEAR $L$_16_blocks_ok_513 + +$L$_16_blocks_overflow_513: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpshufb zmm0,zmm0,zmm29 +$L$_16_blocks_ok_513: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1024+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm0,3 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1088+rsp] + vpxorq zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[1152+rsp] + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1216+rsp] + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17{k1}{z},[r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm24,zmm14,zmm12,0x96 + vpternlogq zmm25,zmm7,zmm13,0x96 + vpternlogq zmm26,zmm10,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vpxorq zmm0,zmm0,zmm17 + vextracti32x4 xmm11,zmm0,3 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10]{k1},zmm0 + vmovdqu8 zmm17{k1}{z},zmm17 + vpshufb zmm17,zmm17,zmm29 + vextracti32x4 xmm7,zmm17,3 + sub r13,16 * (4 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_514 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + + vpxorq zmm30,zmm30,zmm26 + vpxorq zmm8,zmm8,zmm24 + vpxorq zmm22,zmm22,zmm25 + + 
vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_514 +$L$_small_initial_partial_block_514: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpxorq zmm4,zmm4,zmm26 + vpxorq zmm0,zmm0,zmm24 + vpxorq zmm3,zmm3,zmm25 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_514: + + or r13,r13 + je NEAR $L$_after_reduction_514 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_514: + jmp NEAR $L$_last_blocks_done_506 +$L$_last_num_blocks_is_5_506: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,64 + kmovq k1,[rax*8+r10] + cmp r15d,251 + jae NEAR $L$_16_blocks_overflow_515 + vpaddd zmm0,zmm2,zmm28 + vpaddd xmm3,xmm0,xmm27 + jmp NEAR $L$_16_blocks_ok_515 + +$L$_16_blocks_overflow_515: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb xmm3,xmm3,xmm29 +$L$_16_blocks_ok_515: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1024+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm3,0 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1088+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[1152+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1216+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] 
+ + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 xmm19{k1}{z},[64+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm24,zmm14,zmm12,0x96 + vpternlogq zmm25,zmm7,zmm13,0x96 + vpternlogq zmm26,zmm10,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast xmm3,xmm3,xmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq xmm3,xmm3,xmm19 + vextracti32x4 xmm11,zmm3,0 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 XMMWORD[64+r11*1+r10]{k1},xmm3 + vmovdqu8 zmm19{k1}{z},zmm19 + vpshufb zmm17,zmm17,zmm29 + vpshufb xmm19,xmm19,xmm29 + vextracti32x4 xmm7,zmm19,0 + sub r13,16 * (5 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_516 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm19,xmm1,0x01 + vpclmulqdq xmm5,xmm19,xmm1,0x10 + vpclmulqdq xmm0,xmm19,xmm1,0x11 + vpclmulqdq xmm3,xmm19,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_516 +$L$_small_initial_partial_block_516: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + + vpxorq zmm30,zmm30,zmm26 + vpxorq zmm8,zmm8,zmm24 + vpxorq zmm22,zmm22,zmm25 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + 
vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_516: + + or r13,r13 + je NEAR $L$_after_reduction_516 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_516: + jmp NEAR $L$_last_blocks_done_506 +$L$_last_num_blocks_is_6_506: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,64 + kmovq k1,[rax*8+r10] + cmp r15d,250 + jae NEAR $L$_16_blocks_overflow_517 + vpaddd zmm0,zmm2,zmm28 + vpaddd ymm3,ymm0,ymm27 + jmp NEAR $L$_16_blocks_ok_517 + +$L$_16_blocks_overflow_517: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb ymm3,ymm3,ymm29 +$L$_16_blocks_ok_517: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1024+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm3,1 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1088+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[1152+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1216+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 ymm19{k1}{z},[64+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm24,zmm14,zmm12,0x96 + vpternlogq zmm25,zmm7,zmm13,0x96 + vpternlogq zmm26,zmm10,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast ymm3,ymm3,ymm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm11,zmm3,1 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 YMMWORD[64+r11*1+r10]{k1},ymm3 + vmovdqu8 zmm19{k1}{z},zmm19 + vpshufb zmm17,zmm17,zmm29 + vpshufb ymm19,ymm19,ymm29 + vextracti32x4 xmm7,zmm19,1 + sub r13,16 * (6 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_518 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq 
zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm19,ymm1,0x01 + vpclmulqdq ymm5,ymm19,ymm1,0x10 + vpclmulqdq ymm0,ymm19,ymm1,0x11 + vpclmulqdq ymm3,ymm19,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_518 +$L$_small_initial_partial_block_518: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm19,xmm1,0x01 + vpclmulqdq xmm5,xmm19,xmm1,0x10 + vpclmulqdq xmm0,xmm19,xmm1,0x11 + vpclmulqdq xmm3,xmm19,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_518: + + or r13,r13 + je NEAR $L$_after_reduction_518 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_518: + jmp NEAR $L$_last_blocks_done_506 +$L$_last_num_blocks_is_7_506: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,64 + kmovq k1,[rax*8+r10] + cmp r15d,249 + jae NEAR $L$_16_blocks_overflow_519 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + jmp NEAR $L$_16_blocks_ok_519 + +$L$_16_blocks_overflow_519: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 +$L$_16_blocks_ok_519: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1024+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm3,2 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1088+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[1152+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + 
vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1216+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19{k1}{z},[64+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm24,zmm14,zmm12,0x96 + vpternlogq zmm25,zmm7,zmm13,0x96 + vpternlogq zmm26,zmm10,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti32x4 xmm11,zmm3,2 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10]{k1},zmm3 + vmovdqu8 zmm19{k1}{z},zmm19 + vpshufb zmm17,zmm17,zmm29 + vpshufb zmm19,zmm19,zmm29 + vextracti32x4 xmm7,zmm19,2 + sub r13,16 * (7 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_520 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm19,zmm1,0x01 + vpclmulqdq zmm5,zmm19,zmm1,0x10 + vpclmulqdq zmm0,zmm19,zmm1,0x11 + vpclmulqdq zmm3,zmm19,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_520 +$L$_small_initial_partial_block_520: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + 
vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm19,ymm1,0x01 + vpclmulqdq ymm5,ymm19,ymm1,0x10 + vpclmulqdq ymm0,ymm19,ymm1,0x11 + vpclmulqdq ymm3,ymm19,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_520: + + or r13,r13 + je NEAR $L$_after_reduction_520 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_520: + jmp NEAR $L$_last_blocks_done_506 +$L$_last_num_blocks_is_8_506: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,64 + kmovq k1,[rax*8+r10] + cmp r15d,248 + jae NEAR $L$_16_blocks_overflow_521 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + jmp NEAR $L$_16_blocks_ok_521 + +$L$_16_blocks_overflow_521: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 +$L$_16_blocks_ok_521: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1024+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm3,3 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1088+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[1152+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1216+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19{k1}{z},[64+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm24,zmm14,zmm12,0x96 + vpternlogq 
zmm25,zmm7,zmm13,0x96 + vpternlogq zmm26,zmm10,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti32x4 xmm11,zmm3,3 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10]{k1},zmm3 + vmovdqu8 zmm19{k1}{z},zmm19 + vpshufb zmm17,zmm17,zmm29 + vpshufb zmm19,zmm19,zmm29 + vextracti32x4 xmm7,zmm19,3 + sub r13,16 * (8 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_522 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[224+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + + vpxorq zmm30,zmm30,zmm26 + vpxorq zmm8,zmm8,zmm24 + vpxorq zmm22,zmm22,zmm25 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_522 +$L$_small_initial_partial_block_522: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm19,zmm1,0x01 + vpclmulqdq zmm5,zmm19,zmm1,0x10 + vpclmulqdq zmm0,zmm19,zmm1,0x11 + vpclmulqdq zmm3,zmm19,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_522: + + or r13,r13 + je NEAR $L$_after_reduction_522 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_522: + jmp NEAR $L$_last_blocks_done_506 +$L$_last_num_blocks_is_9_506: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,128 + kmovq k1,[rax*8+r10] + cmp r15d,247 + jae NEAR $L$_16_blocks_overflow_523 + vpaddd 
zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd xmm4,xmm3,xmm27 + jmp NEAR $L$_16_blocks_ok_523 + +$L$_16_blocks_overflow_523: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb xmm4,xmm4,xmm29 +$L$_16_blocks_ok_523: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1024+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm4,0 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1088+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[1152+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1216+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 xmm20{k1}{z},[128+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm24,zmm14,zmm12,0x96 + vpternlogq zmm25,zmm7,zmm13,0x96 + vpternlogq zmm26,zmm10,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast xmm4,xmm4,xmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq xmm4,xmm4,xmm20 + vextracti32x4 xmm11,zmm4,0 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 XMMWORD[128+r11*1+r10]{k1},xmm4 + vmovdqu8 zmm20{k1}{z},zmm20 + vpshufb zmm17,zmm17,zmm29 + vpshufb zmm19,zmm19,zmm29 + vpshufb xmm20,xmm20,xmm29 + vextracti32x4 xmm7,zmm20,0 + sub r13,16 * (9 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_524 + + + + + + sub r13,16 + mov QWORD[r8],0 + 
vmovdqu64 zmm1,ZMMWORD[208+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm20,xmm1,0x01 + vpclmulqdq xmm5,xmm20,xmm1,0x10 + vpclmulqdq xmm0,xmm20,xmm1,0x11 + vpclmulqdq xmm3,xmm20,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_524 +$L$_small_initial_partial_block_524: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[224+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + + vpxorq zmm30,zmm30,zmm26 + vpxorq zmm8,zmm8,zmm24 + vpxorq zmm22,zmm22,zmm25 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_524: + + or r13,r13 + je NEAR $L$_after_reduction_524 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_524: + jmp NEAR $L$_last_blocks_done_506 +$L$_last_num_blocks_is_10_506: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,128 + kmovq k1,[rax*8+r10] + cmp r15d,246 + jae NEAR $L$_16_blocks_overflow_525 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd ymm4,ymm3,ymm27 + jmp NEAR $L$_16_blocks_ok_525 + +$L$_16_blocks_overflow_525: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb ymm4,ymm4,ymm29 +$L$_16_blocks_ok_525: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1024+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm4,1 + vshufi64x2 zmm2,zmm2,zmm2,0 + 
+ + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1088+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[1152+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1216+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 ymm20{k1}{z},[128+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm24,zmm14,zmm12,0x96 + vpternlogq zmm25,zmm7,zmm13,0x96 + vpternlogq zmm26,zmm10,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast ymm4,ymm4,ymm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq ymm4,ymm4,ymm20 + vextracti32x4 xmm11,zmm4,1 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 YMMWORD[128+r11*1+r10]{k1},ymm4 + vmovdqu8 zmm20{k1}{z},zmm20 + vpshufb zmm17,zmm17,zmm29 + vpshufb zmm19,zmm19,zmm29 + vpshufb ymm20,ymm20,ymm29 + vextracti32x4 xmm7,zmm20,1 + sub r13,16 * (10 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_526 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[192+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm20,ymm1,0x01 + vpclmulqdq ymm5,ymm20,ymm1,0x10 + vpclmulqdq 
ymm0,ymm20,ymm1,0x11 + vpclmulqdq ymm3,ymm20,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_526 +$L$_small_initial_partial_block_526: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[208+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm20,xmm1,0x01 + vpclmulqdq xmm5,xmm20,xmm1,0x10 + vpclmulqdq xmm0,xmm20,xmm1,0x11 + vpclmulqdq xmm3,xmm20,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_526: + + or r13,r13 + je NEAR $L$_after_reduction_526 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_526: + jmp NEAR $L$_last_blocks_done_506 +$L$_last_num_blocks_is_11_506: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,128 + kmovq k1,[rax*8+r10] + cmp r15d,245 + jae NEAR $L$_16_blocks_overflow_527 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + jmp NEAR $L$_16_blocks_ok_527 + +$L$_16_blocks_overflow_527: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 +$L$_16_blocks_ok_527: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1024+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm4,2 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1088+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + 
vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[1152+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1216+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20{k1}{z},[128+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm24,zmm14,zmm12,0x96 + vpternlogq zmm25,zmm7,zmm13,0x96 + vpternlogq zmm26,zmm10,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vextracti32x4 xmm11,zmm4,2 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10]{k1},zmm4 + vmovdqu8 zmm20{k1}{z},zmm20 + vpshufb zmm17,zmm17,zmm29 + vpshufb zmm19,zmm19,zmm29 + vpshufb zmm20,zmm20,zmm29 + vextracti32x4 xmm7,zmm20,2 + sub r13,16 * (11 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_528 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[176+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm20,zmm1,0x01 + vpclmulqdq zmm5,zmm20,zmm1,0x10 + vpclmulqdq zmm0,zmm20,zmm1,0x11 + vpclmulqdq zmm3,zmm20,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq 
zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_528 +$L$_small_initial_partial_block_528: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[192+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm20,ymm1,0x01 + vpclmulqdq ymm5,ymm20,ymm1,0x10 + vpclmulqdq ymm0,ymm20,ymm1,0x11 + vpclmulqdq ymm3,ymm20,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_528: + + or r13,r13 + je NEAR $L$_after_reduction_528 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_528: + jmp NEAR $L$_last_blocks_done_506 +$L$_last_num_blocks_is_12_506: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,128 + kmovq k1,[rax*8+r10] + cmp r15d,244 + jae NEAR $L$_16_blocks_overflow_529 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + jmp NEAR $L$_16_blocks_ok_529 + +$L$_16_blocks_overflow_529: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 +$L$_16_blocks_ok_529: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1024+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm4,3 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1088+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[1152+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq 
zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1216+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20{k1}{z},[128+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm24,zmm14,zmm12,0x96 + vpternlogq zmm25,zmm7,zmm13,0x96 + vpternlogq zmm26,zmm10,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vextracti32x4 xmm11,zmm4,3 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10]{k1},zmm4 + vmovdqu8 zmm20{k1}{z},zmm20 + vpshufb zmm17,zmm17,zmm29 + vpshufb zmm19,zmm19,zmm29 + vpshufb zmm20,zmm20,zmm29 + vextracti32x4 xmm7,zmm20,3 + sub r13,16 * (12 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_530 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[160+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[224+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + + vpxorq zmm30,zmm30,zmm26 + vpxorq zmm8,zmm8,zmm24 + vpxorq zmm22,zmm22,zmm25 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq 
xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_530 +$L$_small_initial_partial_block_530: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[176+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm20,zmm1,0x01 + vpclmulqdq zmm5,zmm20,zmm1,0x10 + vpclmulqdq zmm0,zmm20,zmm1,0x11 + vpclmulqdq zmm3,zmm20,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_530: + + or r13,r13 + je NEAR $L$_after_reduction_530 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_530: + jmp NEAR $L$_last_blocks_done_506 +$L$_last_num_blocks_is_13_506: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,192 + kmovq k1,[rax*8+r10] + cmp r15d,243 + jae NEAR $L$_16_blocks_overflow_531 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + vpaddd xmm5,xmm4,xmm27 + jmp NEAR $L$_16_blocks_ok_531 + +$L$_16_blocks_overflow_531: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpaddd zmm5,zmm4,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb xmm5,xmm5,xmm29 +$L$_16_blocks_ok_531: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1024+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm5,0 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1088+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vpxorq xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[1152+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1216+rsp] + 
vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20,ZMMWORD[128+r11*1+r9] + vmovdqu8 xmm21{k1}{z},[192+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm24,zmm14,zmm12,0x96 + vpternlogq zmm25,zmm7,zmm13,0x96 + vpternlogq zmm26,zmm10,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vaesenclast xmm5,xmm5,xmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vpxorq xmm5,xmm5,xmm21 + vextracti32x4 xmm11,zmm5,0 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm4 + vmovdqu8 XMMWORD[192+r11*1+r10]{k1},xmm5 + vmovdqu8 zmm21{k1}{z},zmm21 + vpshufb zmm17,zmm17,zmm29 + vpshufb zmm19,zmm19,zmm29 + vpshufb zmm20,zmm20,zmm29 + vpshufb xmm21,xmm21,xmm29 + vextracti32x4 xmm7,zmm21,0 + sub r13,16 * (13 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_532 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[144+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[208+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm21,xmm1,0x01 + vpclmulqdq xmm5,xmm21,xmm1,0x10 + vpclmulqdq xmm0,xmm21,xmm1,0x11 + vpclmulqdq xmm3,xmm21,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 
+ vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_532 +$L$_small_initial_partial_block_532: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[160+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[224+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + + vpxorq zmm30,zmm30,zmm26 + vpxorq zmm8,zmm8,zmm24 + vpxorq zmm22,zmm22,zmm25 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_532: + + or r13,r13 + je NEAR $L$_after_reduction_532 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_532: + jmp NEAR $L$_last_blocks_done_506 +$L$_last_num_blocks_is_14_506: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,192 + kmovq k1,[rax*8+r10] + cmp r15d,242 + jae NEAR $L$_16_blocks_overflow_533 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + vpaddd ymm5,ymm4,ymm27 + jmp NEAR $L$_16_blocks_ok_533 + +$L$_16_blocks_overflow_533: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpaddd zmm5,zmm4,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb ymm5,ymm5,ymm29 +$L$_16_blocks_ok_533: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1024+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm5,1 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1088+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vpxorq ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[1152+rsp] + vaesenc 
zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1216+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20,ZMMWORD[128+r11*1+r9] + vmovdqu8 ymm21{k1}{z},[192+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm24,zmm14,zmm12,0x96 + vpternlogq zmm25,zmm7,zmm13,0x96 + vpternlogq zmm26,zmm10,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vaesenclast ymm5,ymm5,ymm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vpxorq ymm5,ymm5,ymm21 + vextracti32x4 xmm11,zmm5,1 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm4 + vmovdqu8 YMMWORD[192+r11*1+r10]{k1},ymm5 + vmovdqu8 zmm21{k1}{z},zmm21 + vpshufb zmm17,zmm17,zmm29 + vpshufb zmm19,zmm19,zmm29 + vpshufb zmm20,zmm20,zmm29 + vpshufb ymm21,ymm21,ymm29 + vextracti32x4 xmm7,zmm21,1 + sub r13,16 * (14 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_534 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[128+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[192+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + 
vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm21,ymm1,0x01 + vpclmulqdq ymm5,ymm21,ymm1,0x10 + vpclmulqdq ymm0,ymm21,ymm1,0x11 + vpclmulqdq ymm3,ymm21,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_534 +$L$_small_initial_partial_block_534: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[144+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[208+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm21,xmm1,0x01 + vpclmulqdq xmm5,xmm21,xmm1,0x10 + vpclmulqdq xmm0,xmm21,xmm1,0x11 + vpclmulqdq xmm3,xmm21,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_534: + + or r13,r13 + je NEAR $L$_after_reduction_534 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_534: + jmp NEAR $L$_last_blocks_done_506 +$L$_last_num_blocks_is_15_506: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,192 + kmovq k1,[rax*8+r10] + cmp r15d,241 + jae NEAR $L$_16_blocks_overflow_535 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + vpaddd zmm5,zmm4,zmm27 + jmp NEAR $L$_16_blocks_ok_535 + +$L$_16_blocks_overflow_535: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpaddd zmm5,zmm4,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb zmm5,zmm5,zmm29 +$L$_16_blocks_ok_535: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1024+rsp] + 
vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm5,2 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1088+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[1152+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1216+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20,ZMMWORD[128+r11*1+r9] + vmovdqu8 zmm21{k1}{z},[192+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm24,zmm14,zmm12,0x96 + vpternlogq zmm25,zmm7,zmm13,0x96 + vpternlogq zmm26,zmm10,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vaesenclast zmm5,zmm5,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vpxorq zmm5,zmm5,zmm21 + vextracti32x4 xmm11,zmm5,2 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm4 + vmovdqu8 ZMMWORD[192+r11*1+r10]{k1},zmm5 + vmovdqu8 zmm21{k1}{z},zmm21 + vpshufb zmm17,zmm17,zmm29 + vpshufb zmm19,zmm19,zmm29 + vpshufb zmm20,zmm20,zmm29 + vpshufb zmm21,zmm21,zmm29 + vextracti32x4 xmm7,zmm21,2 + sub r13,16 * (15 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_536 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[112+rdx] + vpclmulqdq 
zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[176+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm21,zmm1,0x01 + vpclmulqdq zmm5,zmm21,zmm1,0x10 + vpclmulqdq zmm0,zmm21,zmm1,0x11 + vpclmulqdq zmm3,zmm21,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_536 +$L$_small_initial_partial_block_536: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[128+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[192+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm21,ymm1,0x01 + vpclmulqdq ymm5,ymm21,ymm1,0x10 + vpclmulqdq ymm0,ymm21,ymm1,0x11 + vpclmulqdq ymm3,ymm21,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_536: + + or r13,r13 + je NEAR $L$_after_reduction_536 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_536: + jmp NEAR $L$_last_blocks_done_506 +$L$_last_num_blocks_is_16_506: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,192 + 
kmovq k1,[rax*8+r10] + cmp r15d,240 + jae NEAR $L$_16_blocks_overflow_537 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + vpaddd zmm5,zmm4,zmm27 + jmp NEAR $L$_16_blocks_ok_537 + +$L$_16_blocks_overflow_537: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpaddd zmm5,zmm4,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb zmm5,zmm5,zmm29 +$L$_16_blocks_ok_537: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1024+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm5,3 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1088+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[1152+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1216+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20,ZMMWORD[128+r11*1+r9] + vmovdqu8 zmm21{k1}{z},[192+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm24,zmm14,zmm12,0x96 + vpternlogq zmm25,zmm7,zmm13,0x96 + vpternlogq zmm26,zmm10,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vaesenclast zmm5,zmm5,zmm30 + vpxorq zmm0,zmm0,zmm17 + 
vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vpxorq zmm5,zmm5,zmm21 + vextracti32x4 xmm11,zmm5,3 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm4 + vmovdqu8 ZMMWORD[192+r11*1+r10]{k1},zmm5 + vmovdqu8 zmm21{k1}{z},zmm21 + vpshufb zmm17,zmm17,zmm29 + vpshufb zmm19,zmm19,zmm29 + vpshufb zmm20,zmm20,zmm29 + vpshufb zmm21,zmm21,zmm29 + vextracti32x4 xmm7,zmm21,3 + sub r13,16 * (16 - 1) +$L$_small_initial_partial_block_538: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[112+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[176+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm21,zmm1,0x01 + vpclmulqdq zmm5,zmm21,zmm1,0x10 + vpclmulqdq zmm0,zmm21,zmm1,0x11 + vpclmulqdq zmm3,zmm21,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_538: + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_538: + jmp NEAR $L$_last_blocks_done_506 +$L$_last_num_blocks_is_0_506: + vmovdqa64 zmm13,ZMMWORD[1024+rsp] + vmovdqu64 zmm12,ZMMWORD[rbx*1+rsp] + vpclmulqdq zmm0,zmm13,zmm12,0x11 + vpclmulqdq zmm3,zmm13,zmm12,0x00 + vpclmulqdq zmm4,zmm13,zmm12,0x01 + vpclmulqdq zmm5,zmm13,zmm12,0x10 + vmovdqa64 zmm13,ZMMWORD[1088+rsp] + vmovdqu64 zmm12,ZMMWORD[64+rbx*1+rsp] + vpclmulqdq zmm6,zmm13,zmm12,0x11 + vpclmulqdq zmm7,zmm13,zmm12,0x00 + vpclmulqdq zmm10,zmm13,zmm12,0x01 + vpclmulqdq zmm11,zmm13,zmm12,0x10 + vpternlogq zmm26,zmm4,zmm10,0x96 + vpternlogq zmm24,zmm0,zmm6,0x96 + vpternlogq zmm25,zmm3,zmm7,0x96 + vpternlogq zmm26,zmm5,zmm11,0x96 + vmovdqa64 zmm13,ZMMWORD[1152+rsp] + vmovdqu64 zmm12,ZMMWORD[128+rbx*1+rsp] + vpclmulqdq zmm0,zmm13,zmm12,0x11 + vpclmulqdq zmm3,zmm13,zmm12,0x00 + vpclmulqdq zmm4,zmm13,zmm12,0x01 + vpclmulqdq zmm5,zmm13,zmm12,0x10 + vmovdqa64 zmm13,ZMMWORD[1216+rsp] + vmovdqu64 zmm12,ZMMWORD[192+rbx*1+rsp] + vpclmulqdq zmm6,zmm13,zmm12,0x11 + vpclmulqdq zmm7,zmm13,zmm12,0x00 + vpclmulqdq zmm10,zmm13,zmm12,0x01 + vpclmulqdq zmm11,zmm13,zmm12,0x10 + + vpternlogq zmm26,zmm4,zmm10,0x96 + vpternlogq zmm24,zmm0,zmm6,0x96 + vpternlogq zmm25,zmm3,zmm7,0x96 + vpternlogq zmm26,zmm5,zmm11,0x96 + + vpsrldq zmm0,zmm26,8 + vpslldq zmm3,zmm26,8 + vpxorq 
zmm24,zmm24,zmm0 + vpxorq zmm25,zmm25,zmm3 + vextracti64x4 ymm0,zmm24,1 + vpxorq ymm24,ymm24,ymm0 + vextracti32x4 xmm0,ymm24,1 + vpxorq xmm24,xmm24,xmm0 + vextracti64x4 ymm3,zmm25,1 + vpxorq ymm25,ymm25,ymm3 + vextracti32x4 xmm3,ymm25,1 + vpxorq xmm25,xmm25,xmm3 + vmovdqa64 xmm4,XMMWORD[POLY2] + + + vpclmulqdq xmm0,xmm4,xmm25,0x01 + vpslldq xmm0,xmm0,8 + vpxorq xmm0,xmm25,xmm0 + + + vpclmulqdq xmm3,xmm4,xmm0,0x00 + vpsrldq xmm3,xmm3,4 + vpclmulqdq xmm14,xmm4,xmm0,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm3,xmm24,0x96 + +$L$_last_blocks_done_506: + vpshufb xmm2,xmm2,xmm29 + jmp NEAR $L$_ghash_done_497 +$L$_encrypt_32_blocks_497: + cmp r15b,240 + jae NEAR $L$_16_blocks_overflow_539 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + vpaddd zmm5,zmm4,zmm27 + jmp NEAR $L$_16_blocks_ok_539 +$L$_16_blocks_overflow_539: + vpshufb zmm2,zmm2,zmm29 + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpaddd zmm5,zmm4,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb zmm5,zmm5,zmm29 +$L$_16_blocks_ok_539: + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rsp] + + + + + vshufi64x2 zmm2,zmm5,zmm5,255 + add r15b,16 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + + + + + + + + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + + vpclmulqdq zmm6,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + + + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + + + + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + + vpternlogq zmm6,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + + + + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + + + + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20,ZMMWORD[128+r11*1+r9] + vmovdqu8 zmm21,ZMMWORD[192+r11*1+r9] + + + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + + + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + 
vpxorq zmm26,zmm10,zmm15 + vpxorq zmm24,zmm6,zmm12 + vpxorq zmm25,zmm7,zmm13 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vaesenclast zmm5,zmm5,zmm30 + + + + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vpxorq zmm5,zmm5,zmm21 + + + + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm4 + vmovdqu8 ZMMWORD[192+r11*1+r10],zmm5 + vpshufb zmm0,zmm17,zmm29 + vpshufb zmm3,zmm19,zmm29 + vpshufb zmm4,zmm20,zmm29 + vpshufb zmm5,zmm21,zmm29 + vmovdqa64 ZMMWORD[1280+rsp],zmm0 + vmovdqa64 ZMMWORD[1344+rsp],zmm3 + vmovdqa64 ZMMWORD[1408+rsp],zmm4 + vmovdqa64 ZMMWORD[1472+rsp],zmm5 + cmp r15b,240 + jae NEAR $L$_16_blocks_overflow_540 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + vpaddd zmm5,zmm4,zmm27 + jmp NEAR $L$_16_blocks_ok_540 +$L$_16_blocks_overflow_540: + vpshufb zmm2,zmm2,zmm29 + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpaddd zmm5,zmm4,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb zmm5,zmm5,zmm29 +$L$_16_blocks_ok_540: + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1024+rsp] + vmovdqu64 zmm1,ZMMWORD[256+rsp] + + + + + vshufi64x2 zmm2,zmm5,zmm5,255 + add r15b,16 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[320+rsp] + vmovdqa64 zmm22,ZMMWORD[1088+rsp] + + + + + + + + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + + vpclmulqdq zmm6,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[384+rsp] + vmovdqa64 zmm8,ZMMWORD[1152+rsp] + + + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[448+rsp] + vmovdqa64 zmm22,ZMMWORD[1216+rsp] + + + + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + + vpternlogq zmm6,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + + + + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + + + + vmovdqu8 zmm17,ZMMWORD[256+r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[320+r11*1+r9] + vmovdqu8 zmm20,ZMMWORD[384+r11*1+r9] + vmovdqu8 zmm21,ZMMWORD[448+r11*1+r9] 
+ + + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + + + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm26,zmm10,zmm15,0x96 + vpternlogq zmm24,zmm6,zmm12,0x96 + vpternlogq zmm25,zmm7,zmm13,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vaesenclast zmm5,zmm5,zmm30 + + + + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vpxorq zmm5,zmm5,zmm21 + + + + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[256+r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[320+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[384+r11*1+r10],zmm4 + vmovdqu8 ZMMWORD[448+r11*1+r10],zmm5 + vpshufb zmm0,zmm17,zmm29 + vpshufb zmm3,zmm19,zmm29 + vpshufb zmm4,zmm20,zmm29 + vpshufb zmm5,zmm21,zmm29 + vmovdqa64 ZMMWORD[768+rsp],zmm0 + vmovdqa64 ZMMWORD[832+rsp],zmm3 + vmovdqa64 ZMMWORD[896+rsp],zmm4 + vmovdqa64 ZMMWORD[960+rsp],zmm5 + vmovdqa64 zmm13,ZMMWORD[1280+rsp] + vmovdqu64 zmm12,ZMMWORD[512+rsp] + vpclmulqdq zmm0,zmm13,zmm12,0x11 + vpclmulqdq zmm3,zmm13,zmm12,0x00 + vpclmulqdq zmm4,zmm13,zmm12,0x01 + vpclmulqdq zmm5,zmm13,zmm12,0x10 + vmovdqa64 zmm13,ZMMWORD[1344+rsp] + vmovdqu64 zmm12,ZMMWORD[576+rsp] + vpclmulqdq zmm6,zmm13,zmm12,0x11 + vpclmulqdq zmm7,zmm13,zmm12,0x00 + vpclmulqdq zmm10,zmm13,zmm12,0x01 + vpclmulqdq zmm11,zmm13,zmm12,0x10 + vpternlogq zmm26,zmm4,zmm10,0x96 + vpternlogq zmm24,zmm0,zmm6,0x96 + vpternlogq zmm25,zmm3,zmm7,0x96 + vpternlogq zmm26,zmm5,zmm11,0x96 + vmovdqa64 zmm13,ZMMWORD[1408+rsp] + vmovdqu64 zmm12,ZMMWORD[640+rsp] + vpclmulqdq zmm0,zmm13,zmm12,0x11 + vpclmulqdq zmm3,zmm13,zmm12,0x00 + vpclmulqdq zmm4,zmm13,zmm12,0x01 + vpclmulqdq zmm5,zmm13,zmm12,0x10 + vmovdqa64 zmm13,ZMMWORD[1472+rsp] + vmovdqu64 zmm12,ZMMWORD[704+rsp] + vpclmulqdq zmm6,zmm13,zmm12,0x11 + vpclmulqdq zmm7,zmm13,zmm12,0x00 + vpclmulqdq zmm10,zmm13,zmm12,0x01 + vpclmulqdq zmm11,zmm13,zmm12,0x10 + + vpternlogq zmm26,zmm4,zmm10,0x96 + vpternlogq zmm24,zmm0,zmm6,0x96 + vpternlogq zmm25,zmm3,zmm7,0x96 + vpternlogq zmm26,zmm5,zmm11,0x96 + + vpsrldq zmm0,zmm26,8 + vpslldq zmm3,zmm26,8 + vpxorq zmm24,zmm24,zmm0 + vpxorq zmm25,zmm25,zmm3 + vextracti64x4 ymm0,zmm24,1 + vpxorq ymm24,ymm24,ymm0 + vextracti32x4 xmm0,ymm24,1 + vpxorq xmm24,xmm24,xmm0 + vextracti64x4 ymm3,zmm25,1 + vpxorq ymm25,ymm25,ymm3 + vextracti32x4 xmm3,ymm25,1 + vpxorq xmm25,xmm25,xmm3 + vmovdqa64 xmm4,XMMWORD[POLY2] + + + vpclmulqdq xmm0,xmm4,xmm25,0x01 + vpslldq xmm0,xmm0,8 + vpxorq xmm0,xmm25,xmm0 + + + vpclmulqdq xmm3,xmm4,xmm0,0x00 + vpsrldq xmm3,xmm3,4 + vpclmulqdq xmm14,xmm4,xmm0,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm3,xmm24,0x96 + + sub r13,512 + add r11,512 + mov r10d,r13d + and r10d,~15 + mov ebx,512 + sub ebx,r10d + mov r10d,r13d + add r10d,15 + shr r10d,4 + je NEAR $L$_last_num_blocks_is_0_541 + + cmp r10d,8 + je NEAR 
$L$_last_num_blocks_is_8_541 + jb NEAR $L$_last_num_blocks_is_7_1_541 + + + cmp r10d,12 + je NEAR $L$_last_num_blocks_is_12_541 + jb NEAR $L$_last_num_blocks_is_11_9_541 + + + cmp r10d,15 + je NEAR $L$_last_num_blocks_is_15_541 + ja NEAR $L$_last_num_blocks_is_16_541 + cmp r10d,14 + je NEAR $L$_last_num_blocks_is_14_541 + jmp NEAR $L$_last_num_blocks_is_13_541 + +$L$_last_num_blocks_is_11_9_541: + + cmp r10d,10 + je NEAR $L$_last_num_blocks_is_10_541 + ja NEAR $L$_last_num_blocks_is_11_541 + jmp NEAR $L$_last_num_blocks_is_9_541 + +$L$_last_num_blocks_is_7_1_541: + cmp r10d,4 + je NEAR $L$_last_num_blocks_is_4_541 + jb NEAR $L$_last_num_blocks_is_3_1_541 + + cmp r10d,6 + ja NEAR $L$_last_num_blocks_is_7_541 + je NEAR $L$_last_num_blocks_is_6_541 + jmp NEAR $L$_last_num_blocks_is_5_541 + +$L$_last_num_blocks_is_3_1_541: + + cmp r10d,2 + ja NEAR $L$_last_num_blocks_is_3_541 + je NEAR $L$_last_num_blocks_is_2_541 +$L$_last_num_blocks_is_1_541: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + kmovq k1,[rax*8+r10] + cmp r15d,255 + jae NEAR $L$_16_blocks_overflow_542 + vpaddd xmm0,xmm2,xmm28 + jmp NEAR $L$_16_blocks_ok_542 + +$L$_16_blocks_overflow_542: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpshufb xmm0,xmm0,xmm29 +$L$_16_blocks_ok_542: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm0,0 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc xmm0,xmm0,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc xmm0,xmm0,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 xmm17{k1}{z},[r11*1+r9] + vaesenc xmm0,xmm0,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc xmm0,xmm0,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc xmm0,xmm0,xmm31 + vaesenclast xmm0,xmm0,xmm30 + vpxorq xmm0,xmm0,xmm17 + vextracti32x4 xmm11,zmm0,0 + mov r10,QWORD[120+rbp] + vmovdqu8 XMMWORD[r11*1+r10]{k1},xmm0 + vmovdqu8 zmm17{k1}{z},zmm17 + vpshufb xmm17,xmm17,xmm29 + vextracti32x4 xmm7,zmm17,0 + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_543 + + + + + + sub r13,16 + mov 
QWORD[r8],0 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm17,xmm1,0x01 + vpclmulqdq xmm5,xmm17,xmm1,0x10 + vpclmulqdq xmm0,xmm17,xmm1,0x11 + vpclmulqdq xmm3,xmm17,xmm1,0x00 + vpxorq zmm4,zmm4,zmm26 + vpxorq zmm0,zmm0,zmm24 + vpxorq zmm3,zmm3,zmm25 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_543 +$L$_small_initial_partial_block_543: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + + + vpsrldq zmm0,zmm26,8 + vpslldq zmm3,zmm26,8 + vpxorq zmm24,zmm24,zmm0 + vpxorq zmm25,zmm25,zmm3 + vextracti64x4 ymm0,zmm24,1 + vpxorq ymm24,ymm24,ymm0 + vextracti32x4 xmm0,ymm24,1 + vpxorq xmm24,xmm24,xmm0 + vextracti64x4 ymm3,zmm25,1 + vpxorq ymm25,ymm25,ymm3 + vextracti32x4 xmm3,ymm25,1 + vpxorq xmm25,xmm25,xmm3 + vmovdqa64 xmm0,XMMWORD[POLY2] + + + vpclmulqdq xmm3,xmm0,xmm25,0x01 + vpslldq xmm3,xmm3,8 + vpxorq xmm3,xmm25,xmm3 + + + vpclmulqdq xmm4,xmm0,xmm3,0x00 + vpsrldq xmm4,xmm4,4 + vpclmulqdq xmm14,xmm0,xmm3,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm4,xmm24,0x96 + + + + + + + + + + + + + vpxorq xmm14,xmm14,xmm7 + + jmp NEAR $L$_after_reduction_543 +$L$_small_initial_compute_done_543: +$L$_after_reduction_543: + jmp NEAR $L$_last_blocks_done_541 +$L$_last_num_blocks_is_2_541: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + kmovq k1,[rax*8+r10] + cmp r15d,254 + jae NEAR $L$_16_blocks_overflow_544 + vpaddd ymm0,ymm2,ymm28 + jmp NEAR $L$_16_blocks_ok_544 + +$L$_16_blocks_overflow_544: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpshufb ymm0,ymm0,ymm29 +$L$_16_blocks_ok_544: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm0,1 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc ymm0,ymm0,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc ymm0,ymm0,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 ymm17{k1}{z},[r11*1+r9] + vaesenc 
ymm0,ymm0,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc ymm0,ymm0,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc ymm0,ymm0,ymm31 + vaesenclast ymm0,ymm0,ymm30 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm11,zmm0,1 + mov r10,QWORD[120+rbp] + vmovdqu8 YMMWORD[r11*1+r10]{k1},ymm0 + vmovdqu8 zmm17{k1}{z},zmm17 + vpshufb ymm17,ymm17,ymm29 + vextracti32x4 xmm7,zmm17,1 + sub r13,16 * (2 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_545 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm17,ymm1,0x01 + vpclmulqdq ymm5,ymm17,ymm1,0x10 + vpclmulqdq ymm0,ymm17,ymm1,0x11 + vpclmulqdq ymm3,ymm17,ymm1,0x00 + vpxorq zmm4,zmm4,zmm26 + vpxorq zmm0,zmm0,zmm24 + vpxorq zmm3,zmm3,zmm25 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_545 +$L$_small_initial_partial_block_545: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm17,xmm1,0x01 + vpclmulqdq xmm5,xmm17,xmm1,0x10 + vpclmulqdq xmm0,xmm17,xmm1,0x11 + vpclmulqdq xmm3,xmm17,xmm1,0x00 + vpxorq zmm4,zmm4,zmm26 + vpxorq zmm0,zmm0,zmm24 + vpxorq zmm3,zmm3,zmm25 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_545: + + or r13,r13 + je NEAR $L$_after_reduction_545 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_545: + jmp NEAR $L$_last_blocks_done_541 +$L$_last_num_blocks_is_3_541: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + kmovq k1,[rax*8+r10] + cmp r15d,253 + jae NEAR $L$_16_blocks_overflow_546 + vpaddd zmm0,zmm2,zmm28 + jmp NEAR $L$_16_blocks_ok_546 + +$L$_16_blocks_overflow_546: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpshufb zmm0,zmm0,zmm29 +$L$_16_blocks_ok_546: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm0,2 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 
zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17{k1}{z},[r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vpxorq zmm0,zmm0,zmm17 + vextracti32x4 xmm11,zmm0,2 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10]{k1},zmm0 + vmovdqu8 zmm17{k1}{z},zmm17 + vpshufb zmm17,zmm17,zmm29 + vextracti32x4 xmm7,zmm17,2 + sub r13,16 * (3 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_547 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpxorq zmm4,zmm4,zmm26 + vpxorq zmm0,zmm0,zmm24 + vpxorq zmm3,zmm3,zmm25 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_547 +$L$_small_initial_partial_block_547: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm17,ymm1,0x01 + vpclmulqdq ymm5,ymm17,ymm1,0x10 + vpclmulqdq ymm0,ymm17,ymm1,0x11 + vpclmulqdq ymm3,ymm17,ymm1,0x00 + vpxorq zmm4,zmm4,zmm26 + vpxorq zmm0,zmm0,zmm24 + vpxorq zmm3,zmm3,zmm25 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + 
vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_547: + + or r13,r13 + je NEAR $L$_after_reduction_547 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_547: + jmp NEAR $L$_last_blocks_done_541 +$L$_last_num_blocks_is_4_541: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + kmovq k1,[rax*8+r10] + cmp r15d,252 + jae NEAR $L$_16_blocks_overflow_548 + vpaddd zmm0,zmm2,zmm28 + jmp NEAR $L$_16_blocks_ok_548 + +$L$_16_blocks_overflow_548: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpshufb zmm0,zmm0,zmm29 +$L$_16_blocks_ok_548: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm0,3 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17{k1}{z},[r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vpxorq zmm0,zmm0,zmm17 + vextracti32x4 xmm11,zmm0,3 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10]{k1},zmm0 + vmovdqu8 zmm17{k1}{z},zmm17 + vpshufb zmm17,zmm17,zmm29 + vextracti32x4 xmm7,zmm17,3 + sub r13,16 * (4 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_549 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + + vpxorq zmm30,zmm30,zmm26 + vpxorq zmm8,zmm8,zmm24 + vpxorq zmm22,zmm22,zmm25 + + vpxorq zmm30,zmm30,zmm31 + 
vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_549 +$L$_small_initial_partial_block_549: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpxorq zmm4,zmm4,zmm26 + vpxorq zmm0,zmm0,zmm24 + vpxorq zmm3,zmm3,zmm25 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_549: + + or r13,r13 + je NEAR $L$_after_reduction_549 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_549: + jmp NEAR $L$_last_blocks_done_541 +$L$_last_num_blocks_is_5_541: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,64 + kmovq k1,[rax*8+r10] + cmp r15d,251 + jae NEAR $L$_16_blocks_overflow_550 + vpaddd zmm0,zmm2,zmm28 + vpaddd xmm3,xmm0,xmm27 + jmp NEAR $L$_16_blocks_ok_550 + +$L$_16_blocks_overflow_550: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb xmm3,xmm3,xmm29 +$L$_16_blocks_ok_550: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm3,0 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq 
zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 xmm19{k1}{z},[64+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast xmm3,xmm3,xmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq xmm3,xmm3,xmm19 + vextracti32x4 xmm11,zmm3,0 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 XMMWORD[64+r11*1+r10]{k1},xmm3 + vmovdqu8 zmm19{k1}{z},zmm19 + vpshufb zmm17,zmm17,zmm29 + vpshufb xmm19,xmm19,xmm29 + vextracti32x4 xmm7,zmm19,0 + sub r13,16 * (5 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_551 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm19,xmm1,0x01 + vpclmulqdq xmm5,xmm19,xmm1,0x10 + vpclmulqdq xmm0,xmm19,xmm1,0x11 + vpclmulqdq xmm3,xmm19,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_551 +$L$_small_initial_partial_block_551: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + + vpxorq zmm30,zmm30,zmm26 + vpxorq zmm8,zmm8,zmm24 + vpxorq zmm22,zmm22,zmm25 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq 
xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_551: + + or r13,r13 + je NEAR $L$_after_reduction_551 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_551: + jmp NEAR $L$_last_blocks_done_541 +$L$_last_num_blocks_is_6_541: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,64 + kmovq k1,[rax*8+r10] + cmp r15d,250 + jae NEAR $L$_16_blocks_overflow_552 + vpaddd zmm0,zmm2,zmm28 + vpaddd ymm3,ymm0,ymm27 + jmp NEAR $L$_16_blocks_ok_552 + +$L$_16_blocks_overflow_552: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb ymm3,ymm3,ymm29 +$L$_16_blocks_ok_552: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm3,1 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 ymm19{k1}{z},[64+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast ymm3,ymm3,ymm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm11,zmm3,1 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 YMMWORD[64+r11*1+r10]{k1},ymm3 + vmovdqu8 zmm19{k1}{z},zmm19 + vpshufb zmm17,zmm17,zmm29 + vpshufb ymm19,ymm19,ymm29 + vextracti32x4 xmm7,zmm19,1 + sub r13,16 * (6 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_553 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + 
vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm19,ymm1,0x01 + vpclmulqdq ymm5,ymm19,ymm1,0x10 + vpclmulqdq ymm0,ymm19,ymm1,0x11 + vpclmulqdq ymm3,ymm19,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_553 +$L$_small_initial_partial_block_553: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm19,xmm1,0x01 + vpclmulqdq xmm5,xmm19,xmm1,0x10 + vpclmulqdq xmm0,xmm19,xmm1,0x11 + vpclmulqdq xmm3,xmm19,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_553: + + or r13,r13 + je NEAR $L$_after_reduction_553 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_553: + jmp NEAR $L$_last_blocks_done_541 +$L$_last_num_blocks_is_7_541: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,64 + kmovq k1,[rax*8+r10] + cmp r15d,249 + jae NEAR $L$_16_blocks_overflow_554 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + jmp NEAR $L$_16_blocks_ok_554 + +$L$_16_blocks_overflow_554: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 +$L$_16_blocks_ok_554: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm3,2 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq 
zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19{k1}{z},[64+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti32x4 xmm11,zmm3,2 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10]{k1},zmm3 + vmovdqu8 zmm19{k1}{z},zmm19 + vpshufb zmm17,zmm17,zmm29 + vpshufb zmm19,zmm19,zmm29 + vextracti32x4 xmm7,zmm19,2 + sub r13,16 * (7 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_555 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm19,zmm1,0x01 + vpclmulqdq zmm5,zmm19,zmm1,0x10 + vpclmulqdq zmm0,zmm19,zmm1,0x11 + vpclmulqdq zmm3,zmm19,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_555 +$L$_small_initial_partial_block_555: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm19,ymm1,0x01 + vpclmulqdq 
ymm5,ymm19,ymm1,0x10 + vpclmulqdq ymm0,ymm19,ymm1,0x11 + vpclmulqdq ymm3,ymm19,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_555: + + or r13,r13 + je NEAR $L$_after_reduction_555 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_555: + jmp NEAR $L$_last_blocks_done_541 +$L$_last_num_blocks_is_8_541: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,64 + kmovq k1,[rax*8+r10] + cmp r15d,248 + jae NEAR $L$_16_blocks_overflow_556 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + jmp NEAR $L$_16_blocks_ok_556 + +$L$_16_blocks_overflow_556: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 +$L$_16_blocks_ok_556: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm3,3 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19{k1}{z},[64+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + 
vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti32x4 xmm11,zmm3,3 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10]{k1},zmm3 + vmovdqu8 zmm19{k1}{z},zmm19 + vpshufb zmm17,zmm17,zmm29 + vpshufb zmm19,zmm19,zmm29 + vextracti32x4 xmm7,zmm19,3 + sub r13,16 * (8 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_557 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[224+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + + vpxorq zmm30,zmm30,zmm26 + vpxorq zmm8,zmm8,zmm24 + vpxorq zmm22,zmm22,zmm25 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_557 +$L$_small_initial_partial_block_557: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm19,zmm1,0x01 + vpclmulqdq zmm5,zmm19,zmm1,0x10 + vpclmulqdq zmm0,zmm19,zmm1,0x11 + vpclmulqdq zmm3,zmm19,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_557: + + or r13,r13 + je NEAR $L$_after_reduction_557 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_557: + jmp NEAR $L$_last_blocks_done_541 +$L$_last_num_blocks_is_9_541: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,128 + kmovq k1,[rax*8+r10] + cmp r15d,247 + jae NEAR $L$_16_blocks_overflow_558 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd xmm4,xmm3,xmm27 + jmp NEAR $L$_16_blocks_ok_558 + 
+$L$_16_blocks_overflow_558: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb xmm4,xmm4,xmm29 +$L$_16_blocks_ok_558: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm4,0 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 xmm20{k1}{z},[128+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast xmm4,xmm4,xmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq xmm4,xmm4,xmm20 + vextracti32x4 xmm11,zmm4,0 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 XMMWORD[128+r11*1+r10]{k1},xmm4 + vmovdqu8 zmm20{k1}{z},zmm20 + vpshufb zmm17,zmm17,zmm29 + vpshufb zmm19,zmm19,zmm29 + vpshufb xmm20,xmm20,xmm29 + vextracti32x4 xmm7,zmm20,0 + sub r13,16 * (9 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_559 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[208+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq 
zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm20,xmm1,0x01 + vpclmulqdq xmm5,xmm20,xmm1,0x10 + vpclmulqdq xmm0,xmm20,xmm1,0x11 + vpclmulqdq xmm3,xmm20,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_559 +$L$_small_initial_partial_block_559: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[224+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + + vpxorq zmm30,zmm30,zmm26 + vpxorq zmm8,zmm8,zmm24 + vpxorq zmm22,zmm22,zmm25 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_559: + + or r13,r13 + je NEAR $L$_after_reduction_559 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_559: + jmp NEAR $L$_last_blocks_done_541 +$L$_last_num_blocks_is_10_541: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,128 + kmovq k1,[rax*8+r10] + cmp r15d,246 + jae NEAR $L$_16_blocks_overflow_560 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd ymm4,ymm3,ymm27 + jmp NEAR $L$_16_blocks_ok_560 + +$L$_16_blocks_overflow_560: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb ymm4,ymm4,ymm29 +$L$_16_blocks_ok_560: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm4,1 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 
zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 ymm20{k1}{z},[128+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast ymm4,ymm4,ymm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq ymm4,ymm4,ymm20 + vextracti32x4 xmm11,zmm4,1 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 YMMWORD[128+r11*1+r10]{k1},ymm4 + vmovdqu8 zmm20{k1}{z},zmm20 + vpshufb zmm17,zmm17,zmm29 + vpshufb zmm19,zmm19,zmm29 + vpshufb ymm20,ymm20,ymm29 + vextracti32x4 xmm7,zmm20,1 + sub r13,16 * (10 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_561 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[192+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm20,ymm1,0x01 + vpclmulqdq ymm5,ymm20,ymm1,0x10 + vpclmulqdq ymm0,ymm20,ymm1,0x11 + vpclmulqdq ymm3,ymm20,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + 
vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_561 +$L$_small_initial_partial_block_561: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[208+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm20,xmm1,0x01 + vpclmulqdq xmm5,xmm20,xmm1,0x10 + vpclmulqdq xmm0,xmm20,xmm1,0x11 + vpclmulqdq xmm3,xmm20,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_561: + + or r13,r13 + je NEAR $L$_after_reduction_561 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_561: + jmp NEAR $L$_last_blocks_done_541 +$L$_last_num_blocks_is_11_541: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,128 + kmovq k1,[rax*8+r10] + cmp r15d,245 + jae NEAR $L$_16_blocks_overflow_562 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + jmp NEAR $L$_16_blocks_ok_562 + +$L$_16_blocks_overflow_562: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 +$L$_16_blocks_ok_562: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm4,2 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc 
zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20{k1}{z},[128+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vextracti32x4 xmm11,zmm4,2 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10]{k1},zmm4 + vmovdqu8 zmm20{k1}{z},zmm20 + vpshufb zmm17,zmm17,zmm29 + vpshufb zmm19,zmm19,zmm29 + vpshufb zmm20,zmm20,zmm29 + vextracti32x4 xmm7,zmm20,2 + sub r13,16 * (11 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_563 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[176+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm20,zmm1,0x01 + vpclmulqdq zmm5,zmm20,zmm1,0x10 + vpclmulqdq zmm0,zmm20,zmm1,0x11 + vpclmulqdq zmm3,zmm20,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 
ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_563 +$L$_small_initial_partial_block_563: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[192+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm20,ymm1,0x01 + vpclmulqdq ymm5,ymm20,ymm1,0x10 + vpclmulqdq ymm0,ymm20,ymm1,0x11 + vpclmulqdq ymm3,ymm20,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_563: + + or r13,r13 + je NEAR $L$_after_reduction_563 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_563: + jmp NEAR $L$_last_blocks_done_541 +$L$_last_num_blocks_is_12_541: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,128 + kmovq k1,[rax*8+r10] + cmp r15d,244 + jae NEAR $L$_16_blocks_overflow_564 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + jmp NEAR $L$_16_blocks_ok_564 + +$L$_16_blocks_overflow_564: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 +$L$_16_blocks_ok_564: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm4,3 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 
zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20{k1}{z},[128+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vextracti32x4 xmm11,zmm4,3 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10]{k1},zmm4 + vmovdqu8 zmm20{k1}{z},zmm20 + vpshufb zmm17,zmm17,zmm29 + vpshufb zmm19,zmm19,zmm29 + vpshufb zmm20,zmm20,zmm29 + vextracti32x4 xmm7,zmm20,3 + sub r13,16 * (12 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_565 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[160+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[224+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + + vpxorq zmm30,zmm30,zmm26 + vpxorq zmm8,zmm8,zmm24 + vpxorq zmm22,zmm22,zmm25 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp 
NEAR $L$_small_initial_compute_done_565 +$L$_small_initial_partial_block_565: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[176+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm20,zmm1,0x01 + vpclmulqdq zmm5,zmm20,zmm1,0x10 + vpclmulqdq zmm0,zmm20,zmm1,0x11 + vpclmulqdq zmm3,zmm20,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_565: + + or r13,r13 + je NEAR $L$_after_reduction_565 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_565: + jmp NEAR $L$_last_blocks_done_541 +$L$_last_num_blocks_is_13_541: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,192 + kmovq k1,[rax*8+r10] + cmp r15d,243 + jae NEAR $L$_16_blocks_overflow_566 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + vpaddd xmm5,xmm4,xmm27 + jmp NEAR $L$_16_blocks_ok_566 + +$L$_16_blocks_overflow_566: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpaddd zmm5,zmm4,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb xmm5,xmm5,xmm29 +$L$_16_blocks_ok_566: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm5,0 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vpxorq xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq 
zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20,ZMMWORD[128+r11*1+r9] + vmovdqu8 xmm21{k1}{z},[192+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vaesenclast xmm5,xmm5,xmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vpxorq xmm5,xmm5,xmm21 + vextracti32x4 xmm11,zmm5,0 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm4 + vmovdqu8 XMMWORD[192+r11*1+r10]{k1},xmm5 + vmovdqu8 zmm21{k1}{z},zmm21 + vpshufb zmm17,zmm17,zmm29 + vpshufb zmm19,zmm19,zmm29 + vpshufb zmm20,zmm20,zmm29 + vpshufb xmm21,xmm21,xmm29 + vextracti32x4 xmm7,zmm21,0 + sub r13,16 * (13 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_567 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[144+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[208+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm21,xmm1,0x01 + vpclmulqdq xmm5,xmm21,xmm1,0x10 + vpclmulqdq xmm0,xmm21,xmm1,0x11 + vpclmulqdq xmm3,xmm21,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + 
vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_567 +$L$_small_initial_partial_block_567: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[160+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[224+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + + vpxorq zmm30,zmm30,zmm26 + vpxorq zmm8,zmm8,zmm24 + vpxorq zmm22,zmm22,zmm25 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_567: + + or r13,r13 + je NEAR $L$_after_reduction_567 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_567: + jmp NEAR $L$_last_blocks_done_541 +$L$_last_num_blocks_is_14_541: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,192 + kmovq k1,[rax*8+r10] + cmp r15d,242 + jae NEAR $L$_16_blocks_overflow_568 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + vpaddd ymm5,ymm4,ymm27 + jmp NEAR $L$_16_blocks_ok_568 + +$L$_16_blocks_overflow_568: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpaddd zmm5,zmm4,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb ymm5,ymm5,ymm29 +$L$_16_blocks_ok_568: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm5,1 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vpxorq ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + 
vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20,ZMMWORD[128+r11*1+r9] + vmovdqu8 ymm21{k1}{z},[192+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vaesenclast ymm5,ymm5,ymm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vpxorq ymm5,ymm5,ymm21 + vextracti32x4 xmm11,zmm5,1 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm4 + vmovdqu8 YMMWORD[192+r11*1+r10]{k1},ymm5 + vmovdqu8 zmm21{k1}{z},zmm21 + vpshufb zmm17,zmm17,zmm29 + vpshufb zmm19,zmm19,zmm29 + vpshufb zmm20,zmm20,zmm29 + vpshufb ymm21,ymm21,ymm29 + vextracti32x4 xmm7,zmm21,1 + sub r13,16 * (14 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_569 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[128+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[192+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm21,ymm1,0x01 + vpclmulqdq ymm5,ymm21,ymm1,0x10 + vpclmulqdq ymm0,ymm21,ymm1,0x11 + vpclmulqdq ymm3,ymm21,ymm1,0x00 
+ + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_569 +$L$_small_initial_partial_block_569: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[144+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[208+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm21,xmm1,0x01 + vpclmulqdq xmm5,xmm21,xmm1,0x10 + vpclmulqdq xmm0,xmm21,xmm1,0x11 + vpclmulqdq xmm3,xmm21,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_569: + + or r13,r13 + je NEAR $L$_after_reduction_569 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_569: + jmp NEAR $L$_last_blocks_done_541 +$L$_last_num_blocks_is_15_541: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,192 + kmovq k1,[rax*8+r10] + cmp r15d,241 + jae NEAR $L$_16_blocks_overflow_570 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + vpaddd zmm5,zmm4,zmm27 + jmp NEAR $L$_16_blocks_ok_570 + +$L$_16_blocks_overflow_570: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpaddd zmm5,zmm4,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb zmm5,zmm5,zmm29 +$L$_16_blocks_ok_570: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm5,2 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 
zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20,ZMMWORD[128+r11*1+r9] + vmovdqu8 zmm21{k1}{z},[192+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vaesenclast zmm5,zmm5,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vpxorq zmm5,zmm5,zmm21 + vextracti32x4 xmm11,zmm5,2 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm4 + vmovdqu8 ZMMWORD[192+r11*1+r10]{k1},zmm5 + vmovdqu8 zmm21{k1}{z},zmm21 + vpshufb zmm17,zmm17,zmm29 + vpshufb zmm19,zmm19,zmm29 + vpshufb zmm20,zmm20,zmm29 + vpshufb zmm21,zmm21,zmm29 + vextracti32x4 xmm7,zmm21,2 + sub r13,16 * (15 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_571 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[112+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[176+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq 
zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm21,zmm1,0x01 + vpclmulqdq zmm5,zmm21,zmm1,0x10 + vpclmulqdq zmm0,zmm21,zmm1,0x11 + vpclmulqdq zmm3,zmm21,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_571 +$L$_small_initial_partial_block_571: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[128+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[192+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm21,ymm1,0x01 + vpclmulqdq ymm5,ymm21,ymm1,0x10 + vpclmulqdq ymm0,ymm21,ymm1,0x11 + vpclmulqdq ymm3,ymm21,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_571: + + or r13,r13 + je NEAR $L$_after_reduction_571 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_571: + jmp NEAR $L$_last_blocks_done_541 +$L$_last_num_blocks_is_16_541: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,192 + kmovq k1,[rax*8+r10] + cmp r15d,240 + jae NEAR $L$_16_blocks_overflow_572 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + vpaddd zmm5,zmm4,zmm27 + jmp NEAR $L$_16_blocks_ok_572 + 
+$L$_16_blocks_overflow_572: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpaddd zmm5,zmm4,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb zmm5,zmm5,zmm29 +$L$_16_blocks_ok_572: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm5,3 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20,ZMMWORD[128+r11*1+r9] + vmovdqu8 zmm21{k1}{z},[192+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vaesenclast zmm5,zmm5,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vpxorq zmm5,zmm5,zmm21 + vextracti32x4 xmm11,zmm5,3 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm4 
+ vmovdqu8 ZMMWORD[192+r11*1+r10]{k1},zmm5 + vmovdqu8 zmm21{k1}{z},zmm21 + vpshufb zmm17,zmm17,zmm29 + vpshufb zmm19,zmm19,zmm29 + vpshufb zmm20,zmm20,zmm29 + vpshufb zmm21,zmm21,zmm29 + vextracti32x4 xmm7,zmm21,3 + sub r13,16 * (16 - 1) +$L$_small_initial_partial_block_573: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[112+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[176+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm21,zmm1,0x01 + vpclmulqdq zmm5,zmm21,zmm1,0x10 + vpclmulqdq zmm0,zmm21,zmm1,0x11 + vpclmulqdq zmm3,zmm21,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_573: + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_573: + jmp NEAR $L$_last_blocks_done_541 +$L$_last_num_blocks_is_0_541: + vmovdqa64 zmm13,ZMMWORD[768+rsp] + vpxorq zmm13,zmm13,zmm14 + vmovdqu64 zmm12,ZMMWORD[rbx*1+rsp] + vpclmulqdq zmm0,zmm13,zmm12,0x11 + vpclmulqdq zmm3,zmm13,zmm12,0x00 + vpclmulqdq zmm4,zmm13,zmm12,0x01 + vpclmulqdq zmm5,zmm13,zmm12,0x10 + vmovdqa64 zmm13,ZMMWORD[832+rsp] + vmovdqu64 zmm12,ZMMWORD[64+rbx*1+rsp] + vpclmulqdq zmm6,zmm13,zmm12,0x11 + vpclmulqdq zmm7,zmm13,zmm12,0x00 + vpclmulqdq zmm10,zmm13,zmm12,0x01 + vpclmulqdq zmm11,zmm13,zmm12,0x10 + vpxorq zmm26,zmm4,zmm10 + vpxorq zmm24,zmm0,zmm6 + vpxorq zmm25,zmm3,zmm7 + vpternlogq zmm26,zmm5,zmm11,0x96 + vmovdqa64 zmm13,ZMMWORD[896+rsp] + vmovdqu64 zmm12,ZMMWORD[128+rbx*1+rsp] + vpclmulqdq zmm0,zmm13,zmm12,0x11 + vpclmulqdq zmm3,zmm13,zmm12,0x00 + vpclmulqdq zmm4,zmm13,zmm12,0x01 + vpclmulqdq zmm5,zmm13,zmm12,0x10 + vmovdqa64 zmm13,ZMMWORD[960+rsp] + vmovdqu64 zmm12,ZMMWORD[192+rbx*1+rsp] + vpclmulqdq zmm6,zmm13,zmm12,0x11 + vpclmulqdq zmm7,zmm13,zmm12,0x00 + vpclmulqdq zmm10,zmm13,zmm12,0x01 + vpclmulqdq zmm11,zmm13,zmm12,0x10 + + vpternlogq zmm26,zmm4,zmm10,0x96 + vpternlogq zmm24,zmm0,zmm6,0x96 + vpternlogq zmm25,zmm3,zmm7,0x96 + vpternlogq zmm26,zmm5,zmm11,0x96 + + vpsrldq zmm0,zmm26,8 + vpslldq zmm3,zmm26,8 + vpxorq zmm24,zmm24,zmm0 + vpxorq zmm25,zmm25,zmm3 + vextracti64x4 ymm0,zmm24,1 + vpxorq ymm24,ymm24,ymm0 + vextracti32x4 xmm0,ymm24,1 + vpxorq xmm24,xmm24,xmm0 + vextracti64x4 ymm3,zmm25,1 + vpxorq ymm25,ymm25,ymm3 + vextracti32x4 xmm3,ymm25,1 + vpxorq 
xmm25,xmm25,xmm3 + vmovdqa64 xmm4,XMMWORD[POLY2] + + + vpclmulqdq xmm0,xmm4,xmm25,0x01 + vpslldq xmm0,xmm0,8 + vpxorq xmm0,xmm25,xmm0 + + + vpclmulqdq xmm3,xmm4,xmm0,0x00 + vpsrldq xmm3,xmm3,4 + vpclmulqdq xmm14,xmm4,xmm0,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm3,xmm24,0x96 + +$L$_last_blocks_done_541: + vpshufb xmm2,xmm2,xmm29 + jmp NEAR $L$_ghash_done_497 +$L$_encrypt_16_blocks_497: + cmp r15b,240 + jae NEAR $L$_16_blocks_overflow_574 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + vpaddd zmm5,zmm4,zmm27 + jmp NEAR $L$_16_blocks_ok_574 +$L$_16_blocks_overflow_574: + vpshufb zmm2,zmm2,zmm29 + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpaddd zmm5,zmm4,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb zmm5,zmm5,zmm29 +$L$_16_blocks_ok_574: + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rsp] + + + + + vshufi64x2 zmm2,zmm5,zmm5,255 + add r15b,16 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + + + + + + + + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + + vpclmulqdq zmm6,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + + + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + + + + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + + vpternlogq zmm6,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + + + + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + + + + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20,ZMMWORD[128+r11*1+r9] + vmovdqu8 zmm21,ZMMWORD[192+r11*1+r9] + + + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + + + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm26,zmm10,zmm15 + vpxorq zmm24,zmm6,zmm12 + vpxorq zmm25,zmm7,zmm13 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + 
vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vaesenclast zmm5,zmm5,zmm30 + + + + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vpxorq zmm5,zmm5,zmm21 + + + + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm4 + vmovdqu8 ZMMWORD[192+r11*1+r10],zmm5 + vpshufb zmm0,zmm17,zmm29 + vpshufb zmm3,zmm19,zmm29 + vpshufb zmm4,zmm20,zmm29 + vpshufb zmm5,zmm21,zmm29 + vmovdqa64 ZMMWORD[1280+rsp],zmm0 + vmovdqa64 ZMMWORD[1344+rsp],zmm3 + vmovdqa64 ZMMWORD[1408+rsp],zmm4 + vmovdqa64 ZMMWORD[1472+rsp],zmm5 + vmovdqa64 zmm13,ZMMWORD[1024+rsp] + vmovdqu64 zmm12,ZMMWORD[256+rsp] + vpclmulqdq zmm0,zmm13,zmm12,0x11 + vpclmulqdq zmm3,zmm13,zmm12,0x00 + vpclmulqdq zmm4,zmm13,zmm12,0x01 + vpclmulqdq zmm5,zmm13,zmm12,0x10 + vmovdqa64 zmm13,ZMMWORD[1088+rsp] + vmovdqu64 zmm12,ZMMWORD[320+rsp] + vpclmulqdq zmm6,zmm13,zmm12,0x11 + vpclmulqdq zmm7,zmm13,zmm12,0x00 + vpclmulqdq zmm10,zmm13,zmm12,0x01 + vpclmulqdq zmm11,zmm13,zmm12,0x10 + vpternlogq zmm26,zmm4,zmm10,0x96 + vpternlogq zmm24,zmm0,zmm6,0x96 + vpternlogq zmm25,zmm3,zmm7,0x96 + vpternlogq zmm26,zmm5,zmm11,0x96 + vmovdqa64 zmm13,ZMMWORD[1152+rsp] + vmovdqu64 zmm12,ZMMWORD[384+rsp] + vpclmulqdq zmm0,zmm13,zmm12,0x11 + vpclmulqdq zmm3,zmm13,zmm12,0x00 + vpclmulqdq zmm4,zmm13,zmm12,0x01 + vpclmulqdq zmm5,zmm13,zmm12,0x10 + vmovdqa64 zmm13,ZMMWORD[1216+rsp] + vmovdqu64 zmm12,ZMMWORD[448+rsp] + vpclmulqdq zmm6,zmm13,zmm12,0x11 + vpclmulqdq zmm7,zmm13,zmm12,0x00 + vpclmulqdq zmm10,zmm13,zmm12,0x01 + vpclmulqdq zmm11,zmm13,zmm12,0x10 + + vpternlogq zmm26,zmm4,zmm10,0x96 + vpternlogq zmm24,zmm0,zmm6,0x96 + vpternlogq zmm25,zmm3,zmm7,0x96 + vpternlogq zmm26,zmm5,zmm11,0x96 + sub r13,256 + add r11,256 + mov r10d,r13d + add r10d,15 + shr r10d,4 + je NEAR $L$_last_num_blocks_is_0_575 + + cmp r10d,8 + je NEAR $L$_last_num_blocks_is_8_575 + jb NEAR $L$_last_num_blocks_is_7_1_575 + + + cmp r10d,12 + je NEAR $L$_last_num_blocks_is_12_575 + jb NEAR $L$_last_num_blocks_is_11_9_575 + + + cmp r10d,15 + je NEAR $L$_last_num_blocks_is_15_575 + ja NEAR $L$_last_num_blocks_is_16_575 + cmp r10d,14 + je NEAR $L$_last_num_blocks_is_14_575 + jmp NEAR $L$_last_num_blocks_is_13_575 + +$L$_last_num_blocks_is_11_9_575: + + cmp r10d,10 + je NEAR $L$_last_num_blocks_is_10_575 + ja NEAR $L$_last_num_blocks_is_11_575 + jmp NEAR $L$_last_num_blocks_is_9_575 + +$L$_last_num_blocks_is_7_1_575: + cmp r10d,4 + je NEAR $L$_last_num_blocks_is_4_575 + jb NEAR $L$_last_num_blocks_is_3_1_575 + + cmp r10d,6 + ja NEAR $L$_last_num_blocks_is_7_575 + je NEAR $L$_last_num_blocks_is_6_575 + jmp NEAR $L$_last_num_blocks_is_5_575 + +$L$_last_num_blocks_is_3_1_575: + + cmp r10d,2 + ja NEAR $L$_last_num_blocks_is_3_575 + je NEAR $L$_last_num_blocks_is_2_575 +$L$_last_num_blocks_is_1_575: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + kmovq k1,[rax*8+r10] + cmp r15d,255 + jae NEAR $L$_16_blocks_overflow_576 + vpaddd xmm0,xmm2,xmm28 + jmp NEAR $L$_16_blocks_ok_576 + +$L$_16_blocks_overflow_576: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpshufb xmm0,xmm0,xmm29 +$L$_16_blocks_ok_576: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1280+rsp] + vmovdqu64 zmm1,ZMMWORD[512+rsp] + vextracti32x4 
xmm2,zmm0,0 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[576+rsp] + vmovdqa64 zmm22,ZMMWORD[1344+rsp] + vpxorq xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[640+rsp] + vmovdqa64 zmm8,ZMMWORD[1408+rsp] + vaesenc xmm0,xmm0,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[704+rsp] + vmovdqa64 zmm22,ZMMWORD[1472+rsp] + vaesenc xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc xmm0,xmm0,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 xmm17{k1}{z},[r11*1+r9] + vaesenc xmm0,xmm0,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm14,zmm24,zmm12,0x96 + vpternlogq zmm7,zmm25,zmm13,0x96 + vpternlogq zmm10,zmm26,zmm15,0x96 + vaesenc xmm0,xmm0,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vpsrldq zmm15,zmm10,8 + vpslldq zmm10,zmm10,8 + + vmovdqa64 xmm16,XMMWORD[POLY2] + vaesenc xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vpxorq zmm14,zmm14,zmm15 + vpxorq zmm7,zmm7,zmm10 + vaesenc xmm0,xmm0,xmm31 + vextracti64x4 ymm12,zmm14,1 + vpxorq ymm14,ymm14,ymm12 + vextracti32x4 xmm12,ymm14,1 + vpxorq xmm14,xmm14,xmm12 + vextracti64x4 ymm13,zmm7,1 + vpxorq ymm7,ymm7,ymm13 + vextracti32x4 xmm13,ymm7,1 + vpxorq xmm7,xmm7,xmm13 + vpclmulqdq xmm13,xmm16,xmm7,0x01 + vpslldq xmm13,xmm13,8 + vpxorq xmm13,xmm7,xmm13 + vpclmulqdq xmm12,xmm16,xmm13,0x00 + vpsrldq xmm12,xmm12,4 + vpclmulqdq xmm15,xmm16,xmm13,0x10 + vpslldq xmm15,xmm15,4 + + vpternlogq xmm14,xmm15,xmm12,0x96 + vaesenclast xmm0,xmm0,xmm30 + vpxorq xmm0,xmm0,xmm17 + vextracti32x4 xmm11,zmm0,0 + mov r10,QWORD[120+rbp] + vmovdqu8 XMMWORD[r11*1+r10]{k1},xmm0 + vmovdqu8 zmm17{k1}{z},zmm17 + vpshufb xmm17,xmm17,xmm29 + vextracti32x4 xmm7,zmm17,0 + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_577 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm17,xmm1,0x01 + vpclmulqdq xmm5,xmm17,xmm1,0x10 + vpclmulqdq xmm0,xmm17,xmm1,0x11 + vpclmulqdq xmm3,xmm17,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq 
xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_577 +$L$_small_initial_partial_block_577: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + + + + + + + + + + + + vpxorq xmm14,xmm14,xmm7 + + jmp NEAR $L$_after_reduction_577 +$L$_small_initial_compute_done_577: +$L$_after_reduction_577: + jmp NEAR $L$_last_blocks_done_575 +$L$_last_num_blocks_is_2_575: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + kmovq k1,[rax*8+r10] + cmp r15d,254 + jae NEAR $L$_16_blocks_overflow_578 + vpaddd ymm0,ymm2,ymm28 + jmp NEAR $L$_16_blocks_ok_578 + +$L$_16_blocks_overflow_578: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpshufb ymm0,ymm0,ymm29 +$L$_16_blocks_ok_578: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1280+rsp] + vmovdqu64 zmm1,ZMMWORD[512+rsp] + vextracti32x4 xmm2,zmm0,1 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[576+rsp] + vmovdqa64 zmm22,ZMMWORD[1344+rsp] + vpxorq ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[640+rsp] + vmovdqa64 zmm8,ZMMWORD[1408+rsp] + vaesenc ymm0,ymm0,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[704+rsp] + vmovdqa64 zmm22,ZMMWORD[1472+rsp] + vaesenc ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc ymm0,ymm0,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 ymm17{k1}{z},[r11*1+r9] + vaesenc ymm0,ymm0,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm14,zmm24,zmm12,0x96 + vpternlogq zmm7,zmm25,zmm13,0x96 + vpternlogq zmm10,zmm26,zmm15,0x96 + vaesenc ymm0,ymm0,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vpsrldq zmm15,zmm10,8 + vpslldq zmm10,zmm10,8 + + vmovdqa64 xmm16,XMMWORD[POLY2] + vaesenc ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vpxorq zmm14,zmm14,zmm15 + vpxorq zmm7,zmm7,zmm10 + vaesenc ymm0,ymm0,ymm31 + vextracti64x4 ymm12,zmm14,1 + vpxorq ymm14,ymm14,ymm12 + vextracti32x4 xmm12,ymm14,1 + vpxorq xmm14,xmm14,xmm12 + vextracti64x4 ymm13,zmm7,1 + vpxorq ymm7,ymm7,ymm13 + vextracti32x4 xmm13,ymm7,1 + vpxorq xmm7,xmm7,xmm13 + vpclmulqdq xmm13,xmm16,xmm7,0x01 + vpslldq xmm13,xmm13,8 + vpxorq xmm13,xmm7,xmm13 + vpclmulqdq xmm12,xmm16,xmm13,0x00 + vpsrldq xmm12,xmm12,4 + vpclmulqdq xmm15,xmm16,xmm13,0x10 + vpslldq xmm15,xmm15,4 + + vpternlogq xmm14,xmm15,xmm12,0x96 + vaesenclast ymm0,ymm0,ymm30 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm11,zmm0,1 + mov r10,QWORD[120+rbp] + vmovdqu8 YMMWORD[r11*1+r10]{k1},ymm0 + vmovdqu8 zmm17{k1}{z},zmm17 + vpshufb ymm17,ymm17,ymm29 + vextracti32x4 xmm7,zmm17,1 + sub r13,16 * (2 - 1) + + + cmp r13,16 + jl 
NEAR $L$_small_initial_partial_block_579 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm17,ymm1,0x01 + vpclmulqdq ymm5,ymm17,ymm1,0x10 + vpclmulqdq ymm0,ymm17,ymm1,0x11 + vpclmulqdq ymm3,ymm17,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_579 +$L$_small_initial_partial_block_579: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm17,xmm1,0x01 + vpclmulqdq xmm5,xmm17,xmm1,0x10 + vpclmulqdq xmm0,xmm17,xmm1,0x11 + vpclmulqdq xmm3,xmm17,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_579: + + or r13,r13 + je NEAR $L$_after_reduction_579 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_579: + jmp NEAR $L$_last_blocks_done_575 +$L$_last_num_blocks_is_3_575: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + kmovq k1,[rax*8+r10] + cmp r15d,253 + jae NEAR $L$_16_blocks_overflow_580 + vpaddd zmm0,zmm2,zmm28 + jmp NEAR $L$_16_blocks_ok_580 + +$L$_16_blocks_overflow_580: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpshufb zmm0,zmm0,zmm29 +$L$_16_blocks_ok_580: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1280+rsp] + vmovdqu64 zmm1,ZMMWORD[512+rsp] + vextracti32x4 xmm2,zmm0,2 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[576+rsp] + vmovdqa64 zmm22,ZMMWORD[1344+rsp] + vpxorq zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[640+rsp] + vmovdqa64 zmm8,ZMMWORD[1408+rsp] + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[704+rsp] + vmovdqa64 zmm22,ZMMWORD[1472+rsp] + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq 
zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17{k1}{z},[r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm14,zmm24,zmm12,0x96 + vpternlogq zmm7,zmm25,zmm13,0x96 + vpternlogq zmm10,zmm26,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vpsrldq zmm15,zmm10,8 + vpslldq zmm10,zmm10,8 + + vmovdqa64 xmm16,XMMWORD[POLY2] + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vpxorq zmm14,zmm14,zmm15 + vpxorq zmm7,zmm7,zmm10 + vaesenc zmm0,zmm0,zmm31 + vextracti64x4 ymm12,zmm14,1 + vpxorq ymm14,ymm14,ymm12 + vextracti32x4 xmm12,ymm14,1 + vpxorq xmm14,xmm14,xmm12 + vextracti64x4 ymm13,zmm7,1 + vpxorq ymm7,ymm7,ymm13 + vextracti32x4 xmm13,ymm7,1 + vpxorq xmm7,xmm7,xmm13 + vpclmulqdq xmm13,xmm16,xmm7,0x01 + vpslldq xmm13,xmm13,8 + vpxorq xmm13,xmm7,xmm13 + vpclmulqdq xmm12,xmm16,xmm13,0x00 + vpsrldq xmm12,xmm12,4 + vpclmulqdq xmm15,xmm16,xmm13,0x10 + vpslldq xmm15,xmm15,4 + + vpternlogq xmm14,xmm15,xmm12,0x96 + vaesenclast zmm0,zmm0,zmm30 + vpxorq zmm0,zmm0,zmm17 + vextracti32x4 xmm11,zmm0,2 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10]{k1},zmm0 + vmovdqu8 zmm17{k1}{z},zmm17 + vpshufb zmm17,zmm17,zmm29 + vextracti32x4 xmm7,zmm17,2 + sub r13,16 * (3 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_581 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_581 +$L$_small_initial_partial_block_581: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm17,ymm1,0x01 + vpclmulqdq ymm5,ymm17,ymm1,0x10 + vpclmulqdq ymm0,ymm17,ymm1,0x11 + vpclmulqdq ymm3,ymm17,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + 
+$L$_small_initial_compute_done_581: + + or r13,r13 + je NEAR $L$_after_reduction_581 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_581: + jmp NEAR $L$_last_blocks_done_575 +$L$_last_num_blocks_is_4_575: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + kmovq k1,[rax*8+r10] + cmp r15d,252 + jae NEAR $L$_16_blocks_overflow_582 + vpaddd zmm0,zmm2,zmm28 + jmp NEAR $L$_16_blocks_ok_582 + +$L$_16_blocks_overflow_582: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpshufb zmm0,zmm0,zmm29 +$L$_16_blocks_ok_582: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1280+rsp] + vmovdqu64 zmm1,ZMMWORD[512+rsp] + vextracti32x4 xmm2,zmm0,3 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[576+rsp] + vmovdqa64 zmm22,ZMMWORD[1344+rsp] + vpxorq zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[640+rsp] + vmovdqa64 zmm8,ZMMWORD[1408+rsp] + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[704+rsp] + vmovdqa64 zmm22,ZMMWORD[1472+rsp] + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17{k1}{z},[r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm14,zmm24,zmm12,0x96 + vpternlogq zmm7,zmm25,zmm13,0x96 + vpternlogq zmm10,zmm26,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vpsrldq zmm15,zmm10,8 + vpslldq zmm10,zmm10,8 + + vmovdqa64 xmm16,XMMWORD[POLY2] + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vpxorq zmm14,zmm14,zmm15 + vpxorq zmm7,zmm7,zmm10 + vaesenc zmm0,zmm0,zmm31 + vextracti64x4 ymm12,zmm14,1 + vpxorq ymm14,ymm14,ymm12 + vextracti32x4 xmm12,ymm14,1 + vpxorq xmm14,xmm14,xmm12 + vextracti64x4 ymm13,zmm7,1 + vpxorq ymm7,ymm7,ymm13 + vextracti32x4 xmm13,ymm7,1 + vpxorq xmm7,xmm7,xmm13 + vpclmulqdq xmm13,xmm16,xmm7,0x01 + vpslldq xmm13,xmm13,8 + vpxorq xmm13,xmm7,xmm13 + vpclmulqdq xmm12,xmm16,xmm13,0x00 + vpsrldq xmm12,xmm12,4 + vpclmulqdq xmm15,xmm16,xmm13,0x10 + vpslldq xmm15,xmm15,4 + + vpternlogq xmm14,xmm15,xmm12,0x96 + vaesenclast zmm0,zmm0,zmm30 + vpxorq zmm0,zmm0,zmm17 + vextracti32x4 xmm11,zmm0,3 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10]{k1},zmm0 + vmovdqu8 zmm17{k1}{z},zmm17 + vpshufb zmm17,zmm17,zmm29 + vextracti32x4 xmm7,zmm17,3 + sub r13,16 * (4 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_583 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + 
vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_583 +$L$_small_initial_partial_block_583: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_583: + + or r13,r13 + je NEAR $L$_after_reduction_583 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_583: + jmp NEAR $L$_last_blocks_done_575 +$L$_last_num_blocks_is_5_575: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,64 + kmovq k1,[rax*8+r10] + cmp r15d,251 + jae NEAR $L$_16_blocks_overflow_584 + vpaddd zmm0,zmm2,zmm28 + vpaddd xmm3,xmm0,xmm27 + jmp NEAR $L$_16_blocks_ok_584 + +$L$_16_blocks_overflow_584: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb xmm3,xmm3,xmm29 +$L$_16_blocks_ok_584: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1280+rsp] + vmovdqu64 zmm1,ZMMWORD[512+rsp] + vextracti32x4 xmm2,zmm3,0 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[576+rsp] + vmovdqa64 zmm22,ZMMWORD[1344+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[640+rsp] + vmovdqa64 zmm8,ZMMWORD[1408+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[704+rsp] + vmovdqa64 zmm22,ZMMWORD[1472+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 
+ vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 xmm19{k1}{z},[64+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm14,zmm24,zmm12,0x96 + vpternlogq zmm7,zmm25,zmm13,0x96 + vpternlogq zmm10,zmm26,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vpsrldq zmm15,zmm10,8 + vpslldq zmm10,zmm10,8 + + vmovdqa64 xmm16,XMMWORD[POLY2] + vaesenc zmm0,zmm0,zmm30 + vaesenc xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vpxorq zmm14,zmm14,zmm15 + vpxorq zmm7,zmm7,zmm10 + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vextracti64x4 ymm12,zmm14,1 + vpxorq ymm14,ymm14,ymm12 + vextracti32x4 xmm12,ymm14,1 + vpxorq xmm14,xmm14,xmm12 + vextracti64x4 ymm13,zmm7,1 + vpxorq ymm7,ymm7,ymm13 + vextracti32x4 xmm13,ymm7,1 + vpxorq xmm7,xmm7,xmm13 + vpclmulqdq xmm13,xmm16,xmm7,0x01 + vpslldq xmm13,xmm13,8 + vpxorq xmm13,xmm7,xmm13 + vpclmulqdq xmm12,xmm16,xmm13,0x00 + vpsrldq xmm12,xmm12,4 + vpclmulqdq xmm15,xmm16,xmm13,0x10 + vpslldq xmm15,xmm15,4 + + vpternlogq xmm14,xmm15,xmm12,0x96 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast xmm3,xmm3,xmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq xmm3,xmm3,xmm19 + vextracti32x4 xmm11,zmm3,0 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 XMMWORD[64+r11*1+r10]{k1},xmm3 + vmovdqu8 zmm19{k1}{z},zmm19 + vpshufb zmm17,zmm17,zmm29 + vpshufb xmm19,xmm19,xmm29 + vextracti32x4 xmm7,zmm19,0 + sub r13,16 * (5 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_585 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm19,xmm1,0x01 + vpclmulqdq xmm5,xmm19,xmm1,0x10 + vpclmulqdq xmm0,xmm19,xmm1,0x11 + vpclmulqdq xmm3,xmm19,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm31 + vpxorq zmm0,zmm0,zmm8 + vpxorq zmm3,zmm3,zmm22 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_585 +$L$_small_initial_partial_block_585: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + 
vpclmulqdq zmm31,zmm17,zmm1,0x10 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_585: + + or r13,r13 + je NEAR $L$_after_reduction_585 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_585: + jmp NEAR $L$_last_blocks_done_575 +$L$_last_num_blocks_is_6_575: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,64 + kmovq k1,[rax*8+r10] + cmp r15d,250 + jae NEAR $L$_16_blocks_overflow_586 + vpaddd zmm0,zmm2,zmm28 + vpaddd ymm3,ymm0,ymm27 + jmp NEAR $L$_16_blocks_ok_586 + +$L$_16_blocks_overflow_586: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb ymm3,ymm3,ymm29 +$L$_16_blocks_ok_586: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1280+rsp] + vmovdqu64 zmm1,ZMMWORD[512+rsp] + vextracti32x4 xmm2,zmm3,1 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[576+rsp] + vmovdqa64 zmm22,ZMMWORD[1344+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[640+rsp] + vmovdqa64 zmm8,ZMMWORD[1408+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[704+rsp] + vmovdqa64 zmm22,ZMMWORD[1472+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 ymm19{k1}{z},[64+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm14,zmm24,zmm12,0x96 + vpternlogq zmm7,zmm25,zmm13,0x96 + vpternlogq zmm10,zmm26,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vpsrldq zmm15,zmm10,8 + vpslldq zmm10,zmm10,8 + + vmovdqa64 xmm16,XMMWORD[POLY2] + vaesenc zmm0,zmm0,zmm30 + vaesenc ymm3,ymm3,ymm30 + 
vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vpxorq zmm14,zmm14,zmm15 + vpxorq zmm7,zmm7,zmm10 + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vextracti64x4 ymm12,zmm14,1 + vpxorq ymm14,ymm14,ymm12 + vextracti32x4 xmm12,ymm14,1 + vpxorq xmm14,xmm14,xmm12 + vextracti64x4 ymm13,zmm7,1 + vpxorq ymm7,ymm7,ymm13 + vextracti32x4 xmm13,ymm7,1 + vpxorq xmm7,xmm7,xmm13 + vpclmulqdq xmm13,xmm16,xmm7,0x01 + vpslldq xmm13,xmm13,8 + vpxorq xmm13,xmm7,xmm13 + vpclmulqdq xmm12,xmm16,xmm13,0x00 + vpsrldq xmm12,xmm12,4 + vpclmulqdq xmm15,xmm16,xmm13,0x10 + vpslldq xmm15,xmm15,4 + + vpternlogq xmm14,xmm15,xmm12,0x96 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast ymm3,ymm3,ymm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm11,zmm3,1 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 YMMWORD[64+r11*1+r10]{k1},ymm3 + vmovdqu8 zmm19{k1}{z},zmm19 + vpshufb zmm17,zmm17,zmm29 + vpshufb ymm19,ymm19,ymm29 + vextracti32x4 xmm7,zmm19,1 + sub r13,16 * (6 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_587 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm19,ymm1,0x01 + vpclmulqdq ymm5,ymm19,ymm1,0x10 + vpclmulqdq ymm0,ymm19,ymm1,0x11 + vpclmulqdq ymm3,ymm19,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm31 + vpxorq zmm0,zmm0,zmm8 + vpxorq zmm3,zmm3,zmm22 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_587 +$L$_small_initial_partial_block_587: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm19,xmm1,0x01 + vpclmulqdq xmm5,xmm19,xmm1,0x10 + vpclmulqdq xmm0,xmm19,xmm1,0x11 + vpclmulqdq xmm3,xmm19,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm31 + vpxorq zmm0,zmm0,zmm8 + vpxorq zmm3,zmm3,zmm22 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_587: + + or r13,r13 + je NEAR $L$_after_reduction_587 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_587: + jmp NEAR 
$L$_last_blocks_done_575 +$L$_last_num_blocks_is_7_575: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,64 + kmovq k1,[rax*8+r10] + cmp r15d,249 + jae NEAR $L$_16_blocks_overflow_588 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + jmp NEAR $L$_16_blocks_ok_588 + +$L$_16_blocks_overflow_588: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 +$L$_16_blocks_ok_588: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1280+rsp] + vmovdqu64 zmm1,ZMMWORD[512+rsp] + vextracti32x4 xmm2,zmm3,2 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[576+rsp] + vmovdqa64 zmm22,ZMMWORD[1344+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[640+rsp] + vmovdqa64 zmm8,ZMMWORD[1408+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[704+rsp] + vmovdqa64 zmm22,ZMMWORD[1472+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19{k1}{z},[64+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm14,zmm24,zmm12,0x96 + vpternlogq zmm7,zmm25,zmm13,0x96 + vpternlogq zmm10,zmm26,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vpsrldq zmm15,zmm10,8 + vpslldq zmm10,zmm10,8 + + vmovdqa64 xmm16,XMMWORD[POLY2] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vpxorq zmm14,zmm14,zmm15 + vpxorq zmm7,zmm7,zmm10 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vextracti64x4 ymm12,zmm14,1 + vpxorq ymm14,ymm14,ymm12 + vextracti32x4 xmm12,ymm14,1 + vpxorq xmm14,xmm14,xmm12 + vextracti64x4 ymm13,zmm7,1 + vpxorq ymm7,ymm7,ymm13 + vextracti32x4 xmm13,ymm7,1 + vpxorq xmm7,xmm7,xmm13 + vpclmulqdq xmm13,xmm16,xmm7,0x01 + vpslldq xmm13,xmm13,8 + vpxorq xmm13,xmm7,xmm13 + vpclmulqdq xmm12,xmm16,xmm13,0x00 + vpsrldq xmm12,xmm12,4 + vpclmulqdq xmm15,xmm16,xmm13,0x10 + vpslldq xmm15,xmm15,4 + + vpternlogq xmm14,xmm15,xmm12,0x96 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti32x4 xmm11,zmm3,2 + mov r10,QWORD[120+rbp] + vmovdqu8 
ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10]{k1},zmm3 + vmovdqu8 zmm19{k1}{z},zmm19 + vpshufb zmm17,zmm17,zmm29 + vpshufb zmm19,zmm19,zmm29 + vextracti32x4 xmm7,zmm19,2 + sub r13,16 * (7 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_589 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm19,zmm1,0x01 + vpclmulqdq zmm5,zmm19,zmm1,0x10 + vpclmulqdq zmm0,zmm19,zmm1,0x11 + vpclmulqdq zmm3,zmm19,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm31 + vpxorq zmm0,zmm0,zmm8 + vpxorq zmm3,zmm3,zmm22 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_589 +$L$_small_initial_partial_block_589: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm19,ymm1,0x01 + vpclmulqdq ymm5,ymm19,ymm1,0x10 + vpclmulqdq ymm0,ymm19,ymm1,0x11 + vpclmulqdq ymm3,ymm19,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm31 + vpxorq zmm0,zmm0,zmm8 + vpxorq zmm3,zmm3,zmm22 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_589: + + or r13,r13 + je NEAR $L$_after_reduction_589 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_589: + jmp NEAR $L$_last_blocks_done_575 +$L$_last_num_blocks_is_8_575: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,64 + kmovq k1,[rax*8+r10] + cmp r15d,248 + jae NEAR $L$_16_blocks_overflow_590 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + jmp NEAR $L$_16_blocks_ok_590 + +$L$_16_blocks_overflow_590: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 +$L$_16_blocks_ok_590: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1280+rsp] + vmovdqu64 zmm1,ZMMWORD[512+rsp] + vextracti32x4 xmm2,zmm3,3 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 
zmm18,ZMMWORD[576+rsp] + vmovdqa64 zmm22,ZMMWORD[1344+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[640+rsp] + vmovdqa64 zmm8,ZMMWORD[1408+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[704+rsp] + vmovdqa64 zmm22,ZMMWORD[1472+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19{k1}{z},[64+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm14,zmm24,zmm12,0x96 + vpternlogq zmm7,zmm25,zmm13,0x96 + vpternlogq zmm10,zmm26,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vpsrldq zmm15,zmm10,8 + vpslldq zmm10,zmm10,8 + + vmovdqa64 xmm16,XMMWORD[POLY2] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vpxorq zmm14,zmm14,zmm15 + vpxorq zmm7,zmm7,zmm10 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vextracti64x4 ymm12,zmm14,1 + vpxorq ymm14,ymm14,ymm12 + vextracti32x4 xmm12,ymm14,1 + vpxorq xmm14,xmm14,xmm12 + vextracti64x4 ymm13,zmm7,1 + vpxorq ymm7,ymm7,ymm13 + vextracti32x4 xmm13,ymm7,1 + vpxorq xmm7,xmm7,xmm13 + vpclmulqdq xmm13,xmm16,xmm7,0x01 + vpslldq xmm13,xmm13,8 + vpxorq xmm13,xmm7,xmm13 + vpclmulqdq xmm12,xmm16,xmm13,0x00 + vpsrldq xmm12,xmm12,4 + vpclmulqdq xmm15,xmm16,xmm13,0x10 + vpslldq xmm15,xmm15,4 + + vpternlogq xmm14,xmm15,xmm12,0x96 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti32x4 xmm11,zmm3,3 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10]{k1},zmm3 + vmovdqu8 zmm19{k1}{z},zmm19 + vpshufb zmm17,zmm17,zmm29 + vpshufb zmm19,zmm19,zmm29 + vextracti32x4 xmm7,zmm19,3 + sub r13,16 * (8 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_591 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[224+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + 
vpxorq zmm31,zmm5,zmm31 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_591 +$L$_small_initial_partial_block_591: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm19,zmm1,0x01 + vpclmulqdq zmm5,zmm19,zmm1,0x10 + vpclmulqdq zmm0,zmm19,zmm1,0x11 + vpclmulqdq zmm3,zmm19,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm31 + vpxorq zmm0,zmm0,zmm8 + vpxorq zmm3,zmm3,zmm22 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_591: + + or r13,r13 + je NEAR $L$_after_reduction_591 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_591: + jmp NEAR $L$_last_blocks_done_575 +$L$_last_num_blocks_is_9_575: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,128 + kmovq k1,[rax*8+r10] + cmp r15d,247 + jae NEAR $L$_16_blocks_overflow_592 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd xmm4,xmm3,xmm27 + jmp NEAR $L$_16_blocks_ok_592 + +$L$_16_blocks_overflow_592: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb xmm4,xmm4,xmm29 +$L$_16_blocks_ok_592: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1280+rsp] + vmovdqu64 zmm1,ZMMWORD[512+rsp] + vextracti32x4 xmm2,zmm4,0 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[576+rsp] + vmovdqa64 zmm22,ZMMWORD[1344+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[640+rsp] + vmovdqa64 zmm8,ZMMWORD[1408+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 
zmm18,ZMMWORD[704+rsp] + vmovdqa64 zmm22,ZMMWORD[1472+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 xmm20{k1}{z},[128+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm14,zmm24,zmm12,0x96 + vpternlogq zmm7,zmm25,zmm13,0x96 + vpternlogq zmm10,zmm26,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vpsrldq zmm15,zmm10,8 + vpslldq zmm10,zmm10,8 + + vmovdqa64 xmm16,XMMWORD[POLY2] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vpxorq zmm14,zmm14,zmm15 + vpxorq zmm7,zmm7,zmm10 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vextracti64x4 ymm12,zmm14,1 + vpxorq ymm14,ymm14,ymm12 + vextracti32x4 xmm12,ymm14,1 + vpxorq xmm14,xmm14,xmm12 + vextracti64x4 ymm13,zmm7,1 + vpxorq ymm7,ymm7,ymm13 + vextracti32x4 xmm13,ymm7,1 + vpxorq xmm7,xmm7,xmm13 + vpclmulqdq xmm13,xmm16,xmm7,0x01 + vpslldq xmm13,xmm13,8 + vpxorq xmm13,xmm7,xmm13 + vpclmulqdq xmm12,xmm16,xmm13,0x00 + vpsrldq xmm12,xmm12,4 + vpclmulqdq xmm15,xmm16,xmm13,0x10 + vpslldq xmm15,xmm15,4 + + vpternlogq xmm14,xmm15,xmm12,0x96 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast xmm4,xmm4,xmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq xmm4,xmm4,xmm20 + vextracti32x4 xmm11,zmm4,0 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 XMMWORD[128+r11*1+r10]{k1},xmm4 + vmovdqu8 zmm20{k1}{z},zmm20 + vpshufb zmm17,zmm17,zmm29 + vpshufb zmm19,zmm19,zmm29 + vpshufb xmm20,xmm20,xmm29 + vextracti32x4 xmm7,zmm20,0 + sub r13,16 * (9 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_593 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[208+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm20,xmm1,0x01 + vpclmulqdq xmm5,xmm20,xmm1,0x10 + vpclmulqdq xmm0,xmm20,xmm1,0x11 + vpclmulqdq xmm3,xmm20,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpxorq 
zmm5,zmm5,zmm31 + vpxorq zmm0,zmm0,zmm8 + vpxorq zmm3,zmm3,zmm22 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_593 +$L$_small_initial_partial_block_593: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[224+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_593: + + or r13,r13 + je NEAR $L$_after_reduction_593 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_593: + jmp NEAR $L$_last_blocks_done_575 +$L$_last_num_blocks_is_10_575: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,128 + kmovq k1,[rax*8+r10] + cmp r15d,246 + jae NEAR $L$_16_blocks_overflow_594 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd ymm4,ymm3,ymm27 + jmp NEAR $L$_16_blocks_ok_594 + +$L$_16_blocks_overflow_594: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb ymm4,ymm4,ymm29 +$L$_16_blocks_ok_594: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1280+rsp] + vmovdqu64 zmm1,ZMMWORD[512+rsp] + vextracti32x4 xmm2,zmm4,1 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[576+rsp] + vmovdqa64 zmm22,ZMMWORD[1344+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[640+rsp] + vmovdqa64 zmm8,ZMMWORD[1408+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 
zmm18,ZMMWORD[704+rsp] + vmovdqa64 zmm22,ZMMWORD[1472+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 ymm20{k1}{z},[128+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm14,zmm24,zmm12,0x96 + vpternlogq zmm7,zmm25,zmm13,0x96 + vpternlogq zmm10,zmm26,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vpsrldq zmm15,zmm10,8 + vpslldq zmm10,zmm10,8 + + vmovdqa64 xmm16,XMMWORD[POLY2] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vpxorq zmm14,zmm14,zmm15 + vpxorq zmm7,zmm7,zmm10 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vextracti64x4 ymm12,zmm14,1 + vpxorq ymm14,ymm14,ymm12 + vextracti32x4 xmm12,ymm14,1 + vpxorq xmm14,xmm14,xmm12 + vextracti64x4 ymm13,zmm7,1 + vpxorq ymm7,ymm7,ymm13 + vextracti32x4 xmm13,ymm7,1 + vpxorq xmm7,xmm7,xmm13 + vpclmulqdq xmm13,xmm16,xmm7,0x01 + vpslldq xmm13,xmm13,8 + vpxorq xmm13,xmm7,xmm13 + vpclmulqdq xmm12,xmm16,xmm13,0x00 + vpsrldq xmm12,xmm12,4 + vpclmulqdq xmm15,xmm16,xmm13,0x10 + vpslldq xmm15,xmm15,4 + + vpternlogq xmm14,xmm15,xmm12,0x96 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast ymm4,ymm4,ymm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq ymm4,ymm4,ymm20 + vextracti32x4 xmm11,zmm4,1 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 YMMWORD[128+r11*1+r10]{k1},ymm4 + vmovdqu8 zmm20{k1}{z},zmm20 + vpshufb zmm17,zmm17,zmm29 + vpshufb zmm19,zmm19,zmm29 + vpshufb ymm20,ymm20,ymm29 + vextracti32x4 xmm7,zmm20,1 + sub r13,16 * (10 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_595 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[192+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm20,ymm1,0x01 + vpclmulqdq ymm5,ymm20,ymm1,0x10 + vpclmulqdq ymm0,ymm20,ymm1,0x11 + vpclmulqdq ymm3,ymm20,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpxorq 
zmm5,zmm5,zmm31 + vpxorq zmm0,zmm0,zmm8 + vpxorq zmm3,zmm3,zmm22 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_595 +$L$_small_initial_partial_block_595: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[208+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm20,xmm1,0x01 + vpclmulqdq xmm5,xmm20,xmm1,0x10 + vpclmulqdq xmm0,xmm20,xmm1,0x11 + vpclmulqdq xmm3,xmm20,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm31 + vpxorq zmm0,zmm0,zmm8 + vpxorq zmm3,zmm3,zmm22 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_595: + + or r13,r13 + je NEAR $L$_after_reduction_595 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_595: + jmp NEAR $L$_last_blocks_done_575 +$L$_last_num_blocks_is_11_575: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,128 + kmovq k1,[rax*8+r10] + cmp r15d,245 + jae NEAR $L$_16_blocks_overflow_596 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + jmp NEAR $L$_16_blocks_ok_596 + +$L$_16_blocks_overflow_596: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 +$L$_16_blocks_ok_596: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1280+rsp] + vmovdqu64 zmm1,ZMMWORD[512+rsp] + vextracti32x4 xmm2,zmm4,2 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[576+rsp] + vmovdqa64 zmm22,ZMMWORD[1344+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[640+rsp] + vmovdqa64 zmm8,ZMMWORD[1408+rsp] + vaesenc zmm0,zmm0,zmm31 + 
vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[704+rsp] + vmovdqa64 zmm22,ZMMWORD[1472+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20{k1}{z},[128+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm14,zmm24,zmm12,0x96 + vpternlogq zmm7,zmm25,zmm13,0x96 + vpternlogq zmm10,zmm26,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vpsrldq zmm15,zmm10,8 + vpslldq zmm10,zmm10,8 + + vmovdqa64 xmm16,XMMWORD[POLY2] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vpxorq zmm14,zmm14,zmm15 + vpxorq zmm7,zmm7,zmm10 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vextracti64x4 ymm12,zmm14,1 + vpxorq ymm14,ymm14,ymm12 + vextracti32x4 xmm12,ymm14,1 + vpxorq xmm14,xmm14,xmm12 + vextracti64x4 ymm13,zmm7,1 + vpxorq ymm7,ymm7,ymm13 + vextracti32x4 xmm13,ymm7,1 + vpxorq xmm7,xmm7,xmm13 + vpclmulqdq xmm13,xmm16,xmm7,0x01 + vpslldq xmm13,xmm13,8 + vpxorq xmm13,xmm7,xmm13 + vpclmulqdq xmm12,xmm16,xmm13,0x00 + vpsrldq xmm12,xmm12,4 + vpclmulqdq xmm15,xmm16,xmm13,0x10 + vpslldq xmm15,xmm15,4 + + vpternlogq xmm14,xmm15,xmm12,0x96 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vextracti32x4 xmm11,zmm4,2 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10]{k1},zmm4 + vmovdqu8 zmm20{k1}{z},zmm20 + vpshufb zmm17,zmm17,zmm29 + vpshufb zmm19,zmm19,zmm29 + vpshufb zmm20,zmm20,zmm29 + vextracti32x4 xmm7,zmm20,2 + sub r13,16 * (11 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_597 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[176+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq 
zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm20,zmm1,0x01 + vpclmulqdq zmm5,zmm20,zmm1,0x10 + vpclmulqdq zmm0,zmm20,zmm1,0x11 + vpclmulqdq zmm3,zmm20,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm31 + vpxorq zmm0,zmm0,zmm8 + vpxorq zmm3,zmm3,zmm22 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_597 +$L$_small_initial_partial_block_597: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[192+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm20,ymm1,0x01 + vpclmulqdq ymm5,ymm20,ymm1,0x10 + vpclmulqdq ymm0,ymm20,ymm1,0x11 + vpclmulqdq ymm3,ymm20,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm31 + vpxorq zmm0,zmm0,zmm8 + vpxorq zmm3,zmm3,zmm22 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_597: + + or r13,r13 + je NEAR $L$_after_reduction_597 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_597: + jmp NEAR $L$_last_blocks_done_575 +$L$_last_num_blocks_is_12_575: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,128 + kmovq k1,[rax*8+r10] + cmp r15d,244 + jae NEAR $L$_16_blocks_overflow_598 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + jmp NEAR $L$_16_blocks_ok_598 + +$L$_16_blocks_overflow_598: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 +$L$_16_blocks_ok_598: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1280+rsp] + vmovdqu64 zmm1,ZMMWORD[512+rsp] + vextracti32x4 xmm2,zmm4,3 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[576+rsp] + vmovdqa64 zmm22,ZMMWORD[1344+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq 
zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[640+rsp] + vmovdqa64 zmm8,ZMMWORD[1408+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[704+rsp] + vmovdqa64 zmm22,ZMMWORD[1472+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20{k1}{z},[128+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm14,zmm24,zmm12,0x96 + vpternlogq zmm7,zmm25,zmm13,0x96 + vpternlogq zmm10,zmm26,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vpsrldq zmm15,zmm10,8 + vpslldq zmm10,zmm10,8 + + vmovdqa64 xmm16,XMMWORD[POLY2] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vpxorq zmm14,zmm14,zmm15 + vpxorq zmm7,zmm7,zmm10 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vextracti64x4 ymm12,zmm14,1 + vpxorq ymm14,ymm14,ymm12 + vextracti32x4 xmm12,ymm14,1 + vpxorq xmm14,xmm14,xmm12 + vextracti64x4 ymm13,zmm7,1 + vpxorq ymm7,ymm7,ymm13 + vextracti32x4 xmm13,ymm7,1 + vpxorq xmm7,xmm7,xmm13 + vpclmulqdq xmm13,xmm16,xmm7,0x01 + vpslldq xmm13,xmm13,8 + vpxorq xmm13,xmm7,xmm13 + vpclmulqdq xmm12,xmm16,xmm13,0x00 + vpsrldq xmm12,xmm12,4 + vpclmulqdq xmm15,xmm16,xmm13,0x10 + vpslldq xmm15,xmm15,4 + + vpternlogq xmm14,xmm15,xmm12,0x96 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vextracti32x4 xmm11,zmm4,3 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10]{k1},zmm4 + vmovdqu8 zmm20{k1}{z},zmm20 + vpshufb zmm17,zmm17,zmm29 + vpshufb zmm19,zmm19,zmm29 + vpshufb zmm20,zmm20,zmm29 + vextracti32x4 xmm7,zmm20,3 + sub r13,16 * (12 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_599 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[160+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq 
zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[224+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_599 +$L$_small_initial_partial_block_599: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[176+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm20,zmm1,0x01 + vpclmulqdq zmm5,zmm20,zmm1,0x10 + vpclmulqdq zmm0,zmm20,zmm1,0x11 + vpclmulqdq zmm3,zmm20,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm31 + vpxorq zmm0,zmm0,zmm8 + vpxorq zmm3,zmm3,zmm22 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_599: + + or r13,r13 + je NEAR $L$_after_reduction_599 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_599: + jmp NEAR $L$_last_blocks_done_575 +$L$_last_num_blocks_is_13_575: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,192 + kmovq k1,[rax*8+r10] + cmp r15d,243 + jae NEAR $L$_16_blocks_overflow_600 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + vpaddd xmm5,xmm4,xmm27 + jmp NEAR $L$_16_blocks_ok_600 + +$L$_16_blocks_overflow_600: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpaddd zmm5,zmm4,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb xmm5,xmm5,xmm29 +$L$_16_blocks_ok_600: + + + + + vbroadcastf64x2 
zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1280+rsp] + vmovdqu64 zmm1,ZMMWORD[512+rsp] + vextracti32x4 xmm2,zmm5,0 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[576+rsp] + vmovdqa64 zmm22,ZMMWORD[1344+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vpxorq xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[640+rsp] + vmovdqa64 zmm8,ZMMWORD[1408+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[704+rsp] + vmovdqa64 zmm22,ZMMWORD[1472+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20,ZMMWORD[128+r11*1+r9] + vmovdqu8 xmm21{k1}{z},[192+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm14,zmm24,zmm12,0x96 + vpternlogq zmm7,zmm25,zmm13,0x96 + vpternlogq zmm10,zmm26,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vpsrldq zmm15,zmm10,8 + vpslldq zmm10,zmm10,8 + + vmovdqa64 xmm16,XMMWORD[POLY2] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vpxorq zmm14,zmm14,zmm15 + vpxorq zmm7,zmm7,zmm10 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vextracti64x4 ymm12,zmm14,1 + vpxorq ymm14,ymm14,ymm12 + vextracti32x4 xmm12,ymm14,1 + vpxorq xmm14,xmm14,xmm12 + vextracti64x4 ymm13,zmm7,1 + vpxorq ymm7,ymm7,ymm13 + vextracti32x4 xmm13,ymm7,1 + vpxorq xmm7,xmm7,xmm13 + vpclmulqdq xmm13,xmm16,xmm7,0x01 + vpslldq xmm13,xmm13,8 + vpxorq xmm13,xmm7,xmm13 + vpclmulqdq xmm12,xmm16,xmm13,0x00 + vpsrldq xmm12,xmm12,4 + vpclmulqdq xmm15,xmm16,xmm13,0x10 + vpslldq xmm15,xmm15,4 + + vpternlogq xmm14,xmm15,xmm12,0x96 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vaesenclast xmm5,xmm5,xmm30 + vpxorq 
zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vpxorq xmm5,xmm5,xmm21 + vextracti32x4 xmm11,zmm5,0 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm4 + vmovdqu8 XMMWORD[192+r11*1+r10]{k1},xmm5 + vmovdqu8 zmm21{k1}{z},zmm21 + vpshufb zmm17,zmm17,zmm29 + vpshufb zmm19,zmm19,zmm29 + vpshufb zmm20,zmm20,zmm29 + vpshufb xmm21,xmm21,xmm29 + vextracti32x4 xmm7,zmm21,0 + sub r13,16 * (13 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_601 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[144+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[208+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm21,xmm1,0x01 + vpclmulqdq xmm5,xmm21,xmm1,0x10 + vpclmulqdq xmm0,xmm21,xmm1,0x11 + vpclmulqdq xmm3,xmm21,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm31 + vpxorq zmm0,zmm0,zmm8 + vpxorq zmm3,zmm3,zmm22 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_601 +$L$_small_initial_partial_block_601: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[160+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[224+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq 
xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_601: + + or r13,r13 + je NEAR $L$_after_reduction_601 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_601: + jmp NEAR $L$_last_blocks_done_575 +$L$_last_num_blocks_is_14_575: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,192 + kmovq k1,[rax*8+r10] + cmp r15d,242 + jae NEAR $L$_16_blocks_overflow_602 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + vpaddd ymm5,ymm4,ymm27 + jmp NEAR $L$_16_blocks_ok_602 + +$L$_16_blocks_overflow_602: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpaddd zmm5,zmm4,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb ymm5,ymm5,ymm29 +$L$_16_blocks_ok_602: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1280+rsp] + vmovdqu64 zmm1,ZMMWORD[512+rsp] + vextracti32x4 xmm2,zmm5,1 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[576+rsp] + vmovdqa64 zmm22,ZMMWORD[1344+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vpxorq ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[640+rsp] + vmovdqa64 zmm8,ZMMWORD[1408+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[704+rsp] + vmovdqa64 zmm22,ZMMWORD[1472+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20,ZMMWORD[128+r11*1+r9] + vmovdqu8 ymm21{k1}{z},[192+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm14,zmm24,zmm12,0x96 + vpternlogq zmm7,zmm25,zmm13,0x96 + vpternlogq zmm10,zmm26,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vpsrldq zmm15,zmm10,8 + vpslldq zmm10,zmm10,8 + + vmovdqa64 xmm16,XMMWORD[POLY2] + vaesenc 
zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vpxorq zmm14,zmm14,zmm15 + vpxorq zmm7,zmm7,zmm10 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vextracti64x4 ymm12,zmm14,1 + vpxorq ymm14,ymm14,ymm12 + vextracti32x4 xmm12,ymm14,1 + vpxorq xmm14,xmm14,xmm12 + vextracti64x4 ymm13,zmm7,1 + vpxorq ymm7,ymm7,ymm13 + vextracti32x4 xmm13,ymm7,1 + vpxorq xmm7,xmm7,xmm13 + vpclmulqdq xmm13,xmm16,xmm7,0x01 + vpslldq xmm13,xmm13,8 + vpxorq xmm13,xmm7,xmm13 + vpclmulqdq xmm12,xmm16,xmm13,0x00 + vpsrldq xmm12,xmm12,4 + vpclmulqdq xmm15,xmm16,xmm13,0x10 + vpslldq xmm15,xmm15,4 + + vpternlogq xmm14,xmm15,xmm12,0x96 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vaesenclast ymm5,ymm5,ymm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vpxorq ymm5,ymm5,ymm21 + vextracti32x4 xmm11,zmm5,1 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm4 + vmovdqu8 YMMWORD[192+r11*1+r10]{k1},ymm5 + vmovdqu8 zmm21{k1}{z},zmm21 + vpshufb zmm17,zmm17,zmm29 + vpshufb zmm19,zmm19,zmm29 + vpshufb zmm20,zmm20,zmm29 + vpshufb ymm21,ymm21,ymm29 + vextracti32x4 xmm7,zmm21,1 + sub r13,16 * (14 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_603 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[128+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[192+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm21,ymm1,0x01 + vpclmulqdq ymm5,ymm21,ymm1,0x10 + vpclmulqdq ymm0,ymm21,ymm1,0x11 + vpclmulqdq ymm3,ymm21,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm31 + vpxorq zmm0,zmm0,zmm8 + vpxorq zmm3,zmm3,zmm22 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_603 +$L$_small_initial_partial_block_603: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[144+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[208+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq 
zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm21,xmm1,0x01 + vpclmulqdq xmm5,xmm21,xmm1,0x10 + vpclmulqdq xmm0,xmm21,xmm1,0x11 + vpclmulqdq xmm3,xmm21,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm31 + vpxorq zmm0,zmm0,zmm8 + vpxorq zmm3,zmm3,zmm22 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_603: + + or r13,r13 + je NEAR $L$_after_reduction_603 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_603: + jmp NEAR $L$_last_blocks_done_575 +$L$_last_num_blocks_is_15_575: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,192 + kmovq k1,[rax*8+r10] + cmp r15d,241 + jae NEAR $L$_16_blocks_overflow_604 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + vpaddd zmm5,zmm4,zmm27 + jmp NEAR $L$_16_blocks_ok_604 + +$L$_16_blocks_overflow_604: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpaddd zmm5,zmm4,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb zmm5,zmm5,zmm29 +$L$_16_blocks_ok_604: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1280+rsp] + vmovdqu64 zmm1,ZMMWORD[512+rsp] + vextracti32x4 xmm2,zmm5,2 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[576+rsp] + vmovdqa64 zmm22,ZMMWORD[1344+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[640+rsp] + vmovdqa64 zmm8,ZMMWORD[1408+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[704+rsp] + vmovdqa64 zmm22,ZMMWORD[1472+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq 
zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20,ZMMWORD[128+r11*1+r9] + vmovdqu8 zmm21{k1}{z},[192+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm14,zmm24,zmm12,0x96 + vpternlogq zmm7,zmm25,zmm13,0x96 + vpternlogq zmm10,zmm26,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vpsrldq zmm15,zmm10,8 + vpslldq zmm10,zmm10,8 + + vmovdqa64 xmm16,XMMWORD[POLY2] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vpxorq zmm14,zmm14,zmm15 + vpxorq zmm7,zmm7,zmm10 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vextracti64x4 ymm12,zmm14,1 + vpxorq ymm14,ymm14,ymm12 + vextracti32x4 xmm12,ymm14,1 + vpxorq xmm14,xmm14,xmm12 + vextracti64x4 ymm13,zmm7,1 + vpxorq ymm7,ymm7,ymm13 + vextracti32x4 xmm13,ymm7,1 + vpxorq xmm7,xmm7,xmm13 + vpclmulqdq xmm13,xmm16,xmm7,0x01 + vpslldq xmm13,xmm13,8 + vpxorq xmm13,xmm7,xmm13 + vpclmulqdq xmm12,xmm16,xmm13,0x00 + vpsrldq xmm12,xmm12,4 + vpclmulqdq xmm15,xmm16,xmm13,0x10 + vpslldq xmm15,xmm15,4 + + vpternlogq xmm14,xmm15,xmm12,0x96 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vaesenclast zmm5,zmm5,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vpxorq zmm5,zmm5,zmm21 + vextracti32x4 xmm11,zmm5,2 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm4 + vmovdqu8 ZMMWORD[192+r11*1+r10]{k1},zmm5 + vmovdqu8 zmm21{k1}{z},zmm21 + vpshufb zmm17,zmm17,zmm29 + vpshufb zmm19,zmm19,zmm29 + vpshufb zmm20,zmm20,zmm29 + vpshufb zmm21,zmm21,zmm29 + vextracti32x4 xmm7,zmm21,2 + sub r13,16 * (15 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_605 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[112+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[176+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm21,zmm1,0x01 + vpclmulqdq zmm5,zmm21,zmm1,0x10 + vpclmulqdq zmm0,zmm21,zmm1,0x11 + vpclmulqdq zmm3,zmm21,zmm1,0x00 
+ + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm31 + vpxorq zmm0,zmm0,zmm8 + vpxorq zmm3,zmm3,zmm22 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_605 +$L$_small_initial_partial_block_605: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[128+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[192+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm21,ymm1,0x01 + vpclmulqdq ymm5,ymm21,ymm1,0x10 + vpclmulqdq ymm0,ymm21,ymm1,0x11 + vpclmulqdq ymm3,ymm21,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm31 + vpxorq zmm0,zmm0,zmm8 + vpxorq zmm3,zmm3,zmm22 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_605: + + or r13,r13 + je NEAR $L$_after_reduction_605 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_605: + jmp NEAR $L$_last_blocks_done_575 +$L$_last_num_blocks_is_16_575: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,192 + kmovq k1,[rax*8+r10] + cmp r15d,240 + jae NEAR $L$_16_blocks_overflow_606 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + vpaddd zmm5,zmm4,zmm27 + jmp NEAR $L$_16_blocks_ok_606 + +$L$_16_blocks_overflow_606: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpaddd zmm5,zmm4,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb zmm5,zmm5,zmm29 +$L$_16_blocks_ok_606: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1280+rsp] + vmovdqu64 zmm1,ZMMWORD[512+rsp] + vextracti32x4 xmm2,zmm5,3 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[576+rsp] + vmovdqa64 zmm22,ZMMWORD[1344+rsp] + vpxorq zmm0,zmm0,zmm30 + 
vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[640+rsp] + vmovdqa64 zmm8,ZMMWORD[1408+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[704+rsp] + vmovdqa64 zmm22,ZMMWORD[1472+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20,ZMMWORD[128+r11*1+r9] + vmovdqu8 zmm21{k1}{z},[192+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm14,zmm24,zmm12,0x96 + vpternlogq zmm7,zmm25,zmm13,0x96 + vpternlogq zmm10,zmm26,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vpsrldq zmm15,zmm10,8 + vpslldq zmm10,zmm10,8 + + vmovdqa64 xmm16,XMMWORD[POLY2] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vpxorq zmm14,zmm14,zmm15 + vpxorq zmm7,zmm7,zmm10 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vextracti64x4 ymm12,zmm14,1 + vpxorq ymm14,ymm14,ymm12 + vextracti32x4 xmm12,ymm14,1 + vpxorq xmm14,xmm14,xmm12 + vextracti64x4 ymm13,zmm7,1 + vpxorq ymm7,ymm7,ymm13 + vextracti32x4 xmm13,ymm7,1 + vpxorq xmm7,xmm7,xmm13 + vpclmulqdq xmm13,xmm16,xmm7,0x01 + vpslldq xmm13,xmm13,8 + vpxorq xmm13,xmm7,xmm13 + vpclmulqdq xmm12,xmm16,xmm13,0x00 + vpsrldq xmm12,xmm12,4 + vpclmulqdq xmm15,xmm16,xmm13,0x10 + vpslldq xmm15,xmm15,4 + + vpternlogq xmm14,xmm15,xmm12,0x96 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vaesenclast zmm5,zmm5,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vpxorq zmm5,zmm5,zmm21 + vextracti32x4 xmm11,zmm5,3 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm4 + vmovdqu8 
ZMMWORD[192+r11*1+r10]{k1},zmm5 + vmovdqu8 zmm21{k1}{z},zmm21 + vpshufb zmm17,zmm17,zmm29 + vpshufb zmm19,zmm19,zmm29 + vpshufb zmm20,zmm20,zmm29 + vpshufb zmm21,zmm21,zmm29 + vextracti32x4 xmm7,zmm21,3 + sub r13,16 * (16 - 1) +$L$_small_initial_partial_block_607: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[112+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[176+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm21,zmm1,0x01 + vpclmulqdq zmm5,zmm21,zmm1,0x10 + vpclmulqdq zmm0,zmm21,zmm1,0x11 + vpclmulqdq zmm3,zmm21,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm31 + vpxorq zmm0,zmm0,zmm8 + vpxorq zmm3,zmm3,zmm22 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_607: + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_607: + jmp NEAR $L$_last_blocks_done_575 +$L$_last_num_blocks_is_0_575: + vmovdqa64 zmm13,ZMMWORD[1280+rsp] + vmovdqu64 zmm12,ZMMWORD[512+rsp] + vpclmulqdq zmm0,zmm13,zmm12,0x11 + vpclmulqdq zmm3,zmm13,zmm12,0x00 + vpclmulqdq zmm4,zmm13,zmm12,0x01 + vpclmulqdq zmm5,zmm13,zmm12,0x10 + vmovdqa64 zmm13,ZMMWORD[1344+rsp] + vmovdqu64 zmm12,ZMMWORD[576+rsp] + vpclmulqdq zmm6,zmm13,zmm12,0x11 + vpclmulqdq zmm7,zmm13,zmm12,0x00 + vpclmulqdq zmm10,zmm13,zmm12,0x01 + vpclmulqdq zmm11,zmm13,zmm12,0x10 + vpternlogq zmm26,zmm4,zmm10,0x96 + vpternlogq zmm24,zmm0,zmm6,0x96 + vpternlogq zmm25,zmm3,zmm7,0x96 + vpternlogq zmm26,zmm5,zmm11,0x96 + vmovdqa64 zmm13,ZMMWORD[1408+rsp] + vmovdqu64 zmm12,ZMMWORD[640+rsp] + vpclmulqdq zmm0,zmm13,zmm12,0x11 + vpclmulqdq zmm3,zmm13,zmm12,0x00 + vpclmulqdq zmm4,zmm13,zmm12,0x01 + vpclmulqdq zmm5,zmm13,zmm12,0x10 + vmovdqa64 zmm13,ZMMWORD[1472+rsp] + vmovdqu64 zmm12,ZMMWORD[704+rsp] + vpclmulqdq zmm6,zmm13,zmm12,0x11 + vpclmulqdq zmm7,zmm13,zmm12,0x00 + vpclmulqdq zmm10,zmm13,zmm12,0x01 + vpclmulqdq zmm11,zmm13,zmm12,0x10 + + vpternlogq zmm26,zmm4,zmm10,0x96 + vpternlogq zmm24,zmm0,zmm6,0x96 + vpternlogq zmm25,zmm3,zmm7,0x96 + vpternlogq zmm26,zmm5,zmm11,0x96 + + vpsrldq zmm0,zmm26,8 + vpslldq zmm3,zmm26,8 + vpxorq zmm24,zmm24,zmm0 + vpxorq zmm25,zmm25,zmm3 + vextracti64x4 ymm0,zmm24,1 + vpxorq ymm24,ymm24,ymm0 + vextracti32x4 xmm0,ymm24,1 + vpxorq xmm24,xmm24,xmm0 + vextracti64x4 ymm3,zmm25,1 + vpxorq ymm25,ymm25,ymm3 + vextracti32x4 xmm3,ymm25,1 + vpxorq xmm25,xmm25,xmm3 + vmovdqa64 
xmm4,XMMWORD[POLY2] + + + vpclmulqdq xmm0,xmm4,xmm25,0x01 + vpslldq xmm0,xmm0,8 + vpxorq xmm0,xmm25,xmm0 + + + vpclmulqdq xmm3,xmm4,xmm0,0x00 + vpsrldq xmm3,xmm3,4 + vpclmulqdq xmm14,xmm4,xmm0,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm3,xmm24,0x96 + +$L$_last_blocks_done_575: + vpshufb xmm2,xmm2,xmm29 + jmp NEAR $L$_ghash_done_497 + +$L$_message_below_32_blocks_497: + + + sub r13,256 + add r11,256 + mov r10d,r13d + test r14,r14 + jnz NEAR $L$_skip_hkeys_precomputation_608 + vmovdqu64 zmm3,ZMMWORD[640+rsp] + + + vshufi64x2 zmm3,zmm3,zmm3,0x00 + + vmovdqu64 zmm4,ZMMWORD[576+rsp] + vmovdqu64 zmm5,ZMMWORD[512+rsp] + + vpclmulqdq zmm6,zmm4,zmm3,0x11 + vpclmulqdq zmm7,zmm4,zmm3,0x00 + vpclmulqdq zmm10,zmm4,zmm3,0x01 + vpclmulqdq zmm4,zmm4,zmm3,0x10 + vpxorq zmm4,zmm4,zmm10 + + vpsrldq zmm10,zmm4,8 + vpslldq zmm4,zmm4,8 + vpxorq zmm6,zmm6,zmm10 + vpxorq zmm4,zmm4,zmm7 + + + + vmovdqu64 zmm10,ZMMWORD[POLY2] + + vpclmulqdq zmm7,zmm10,zmm4,0x01 + vpslldq zmm7,zmm7,8 + vpxorq zmm4,zmm4,zmm7 + + + + vpclmulqdq zmm7,zmm10,zmm4,0x00 + vpsrldq zmm7,zmm7,4 + vpclmulqdq zmm4,zmm10,zmm4,0x10 + vpslldq zmm4,zmm4,4 + + vpternlogq zmm4,zmm6,zmm7,0x96 + + vmovdqu64 ZMMWORD[448+rsp],zmm4 + + vpclmulqdq zmm6,zmm5,zmm3,0x11 + vpclmulqdq zmm7,zmm5,zmm3,0x00 + vpclmulqdq zmm10,zmm5,zmm3,0x01 + vpclmulqdq zmm5,zmm5,zmm3,0x10 + vpxorq zmm5,zmm5,zmm10 + + vpsrldq zmm10,zmm5,8 + vpslldq zmm5,zmm5,8 + vpxorq zmm6,zmm6,zmm10 + vpxorq zmm5,zmm5,zmm7 + + + + vmovdqu64 zmm10,ZMMWORD[POLY2] + + vpclmulqdq zmm7,zmm10,zmm5,0x01 + vpslldq zmm7,zmm7,8 + vpxorq zmm5,zmm5,zmm7 + + + + vpclmulqdq zmm7,zmm10,zmm5,0x00 + vpsrldq zmm7,zmm7,4 + vpclmulqdq zmm5,zmm10,zmm5,0x10 + vpslldq zmm5,zmm5,4 + + vpternlogq zmm5,zmm6,zmm7,0x96 + + vmovdqu64 ZMMWORD[384+rsp],zmm5 + + vpclmulqdq zmm6,zmm4,zmm3,0x11 + vpclmulqdq zmm7,zmm4,zmm3,0x00 + vpclmulqdq zmm10,zmm4,zmm3,0x01 + vpclmulqdq zmm4,zmm4,zmm3,0x10 + vpxorq zmm4,zmm4,zmm10 + + vpsrldq zmm10,zmm4,8 + vpslldq zmm4,zmm4,8 + vpxorq zmm6,zmm6,zmm10 + vpxorq zmm4,zmm4,zmm7 + + + + vmovdqu64 zmm10,ZMMWORD[POLY2] + + vpclmulqdq zmm7,zmm10,zmm4,0x01 + vpslldq zmm7,zmm7,8 + vpxorq zmm4,zmm4,zmm7 + + + + vpclmulqdq zmm7,zmm10,zmm4,0x00 + vpsrldq zmm7,zmm7,4 + vpclmulqdq zmm4,zmm10,zmm4,0x10 + vpslldq zmm4,zmm4,4 + + vpternlogq zmm4,zmm6,zmm7,0x96 + + vmovdqu64 ZMMWORD[320+rsp],zmm4 + + vpclmulqdq zmm6,zmm5,zmm3,0x11 + vpclmulqdq zmm7,zmm5,zmm3,0x00 + vpclmulqdq zmm10,zmm5,zmm3,0x01 + vpclmulqdq zmm5,zmm5,zmm3,0x10 + vpxorq zmm5,zmm5,zmm10 + + vpsrldq zmm10,zmm5,8 + vpslldq zmm5,zmm5,8 + vpxorq zmm6,zmm6,zmm10 + vpxorq zmm5,zmm5,zmm7 + + + + vmovdqu64 zmm10,ZMMWORD[POLY2] + + vpclmulqdq zmm7,zmm10,zmm5,0x01 + vpslldq zmm7,zmm7,8 + vpxorq zmm5,zmm5,zmm7 + + + + vpclmulqdq zmm7,zmm10,zmm5,0x00 + vpsrldq zmm7,zmm7,4 + vpclmulqdq zmm5,zmm10,zmm5,0x10 + vpslldq zmm5,zmm5,4 + + vpternlogq zmm5,zmm6,zmm7,0x96 + + vmovdqu64 ZMMWORD[256+rsp],zmm5 +$L$_skip_hkeys_precomputation_608: + mov r14,1 + and r10d,~15 + mov ebx,512 + sub ebx,r10d + mov r10d,r13d + add r10d,15 + shr r10d,4 + je NEAR $L$_last_num_blocks_is_0_609 + + cmp r10d,8 + je NEAR $L$_last_num_blocks_is_8_609 + jb NEAR $L$_last_num_blocks_is_7_1_609 + + + cmp r10d,12 + je NEAR $L$_last_num_blocks_is_12_609 + jb NEAR $L$_last_num_blocks_is_11_9_609 + + + cmp r10d,15 + je NEAR $L$_last_num_blocks_is_15_609 + ja NEAR $L$_last_num_blocks_is_16_609 + cmp r10d,14 + je NEAR $L$_last_num_blocks_is_14_609 + jmp NEAR $L$_last_num_blocks_is_13_609 + +$L$_last_num_blocks_is_11_9_609: + + cmp r10d,10 + je NEAR $L$_last_num_blocks_is_10_609 + ja NEAR 
$L$_last_num_blocks_is_11_609 + jmp NEAR $L$_last_num_blocks_is_9_609 + +$L$_last_num_blocks_is_7_1_609: + cmp r10d,4 + je NEAR $L$_last_num_blocks_is_4_609 + jb NEAR $L$_last_num_blocks_is_3_1_609 + + cmp r10d,6 + ja NEAR $L$_last_num_blocks_is_7_609 + je NEAR $L$_last_num_blocks_is_6_609 + jmp NEAR $L$_last_num_blocks_is_5_609 + +$L$_last_num_blocks_is_3_1_609: + + cmp r10d,2 + ja NEAR $L$_last_num_blocks_is_3_609 + je NEAR $L$_last_num_blocks_is_2_609 +$L$_last_num_blocks_is_1_609: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + kmovq k1,[rax*8+r10] + cmp r15d,255 + jae NEAR $L$_16_blocks_overflow_610 + vpaddd xmm0,xmm2,xmm28 + jmp NEAR $L$_16_blocks_ok_610 + +$L$_16_blocks_overflow_610: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpshufb xmm0,xmm0,xmm29 +$L$_16_blocks_ok_610: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm0,0 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc xmm0,xmm0,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc xmm0,xmm0,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 xmm17{k1}{z},[r11*1+r9] + vaesenc xmm0,xmm0,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc xmm0,xmm0,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc xmm0,xmm0,xmm31 + vaesenclast xmm0,xmm0,xmm30 + vpxorq xmm0,xmm0,xmm17 + vextracti32x4 xmm11,zmm0,0 + mov r10,QWORD[120+rbp] + vmovdqu8 XMMWORD[r11*1+r10]{k1},xmm0 + vmovdqu8 zmm17{k1}{z},zmm17 + vpshufb xmm17,xmm17,xmm29 + vextracti32x4 xmm7,zmm17,0 + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_611 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm17,xmm1,0x01 + vpclmulqdq xmm5,xmm17,xmm1,0x10 + vpclmulqdq xmm0,xmm17,xmm1,0x11 + vpclmulqdq xmm3,xmm17,xmm1,0x00 + vpxorq zmm4,zmm4,zmm26 + vpxorq zmm0,zmm0,zmm24 + vpxorq zmm3,zmm3,zmm25 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + 
vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_611 +$L$_small_initial_partial_block_611: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + + + vpsrldq zmm0,zmm26,8 + vpslldq zmm3,zmm26,8 + vpxorq zmm24,zmm24,zmm0 + vpxorq zmm25,zmm25,zmm3 + vextracti64x4 ymm0,zmm24,1 + vpxorq ymm24,ymm24,ymm0 + vextracti32x4 xmm0,ymm24,1 + vpxorq xmm24,xmm24,xmm0 + vextracti64x4 ymm3,zmm25,1 + vpxorq ymm25,ymm25,ymm3 + vextracti32x4 xmm3,ymm25,1 + vpxorq xmm25,xmm25,xmm3 + vmovdqa64 xmm0,XMMWORD[POLY2] + + + vpclmulqdq xmm3,xmm0,xmm25,0x01 + vpslldq xmm3,xmm3,8 + vpxorq xmm3,xmm25,xmm3 + + + vpclmulqdq xmm4,xmm0,xmm3,0x00 + vpsrldq xmm4,xmm4,4 + vpclmulqdq xmm14,xmm0,xmm3,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm4,xmm24,0x96 + + + + + + + + + + + + + vpxorq xmm14,xmm14,xmm7 + + jmp NEAR $L$_after_reduction_611 +$L$_small_initial_compute_done_611: +$L$_after_reduction_611: + jmp NEAR $L$_last_blocks_done_609 +$L$_last_num_blocks_is_2_609: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + kmovq k1,[rax*8+r10] + cmp r15d,254 + jae NEAR $L$_16_blocks_overflow_612 + vpaddd ymm0,ymm2,ymm28 + jmp NEAR $L$_16_blocks_ok_612 + +$L$_16_blocks_overflow_612: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpshufb ymm0,ymm0,ymm29 +$L$_16_blocks_ok_612: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm0,1 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc ymm0,ymm0,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc ymm0,ymm0,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 ymm17{k1}{z},[r11*1+r9] + vaesenc ymm0,ymm0,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc ymm0,ymm0,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc 
ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc ymm0,ymm0,ymm31 + vaesenclast ymm0,ymm0,ymm30 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm11,zmm0,1 + mov r10,QWORD[120+rbp] + vmovdqu8 YMMWORD[r11*1+r10]{k1},ymm0 + vmovdqu8 zmm17{k1}{z},zmm17 + vpshufb ymm17,ymm17,ymm29 + vextracti32x4 xmm7,zmm17,1 + sub r13,16 * (2 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_613 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm17,ymm1,0x01 + vpclmulqdq ymm5,ymm17,ymm1,0x10 + vpclmulqdq ymm0,ymm17,ymm1,0x11 + vpclmulqdq ymm3,ymm17,ymm1,0x00 + vpxorq zmm4,zmm4,zmm26 + vpxorq zmm0,zmm0,zmm24 + vpxorq zmm3,zmm3,zmm25 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_613 +$L$_small_initial_partial_block_613: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm17,xmm1,0x01 + vpclmulqdq xmm5,xmm17,xmm1,0x10 + vpclmulqdq xmm0,xmm17,xmm1,0x11 + vpclmulqdq xmm3,xmm17,xmm1,0x00 + vpxorq zmm4,zmm4,zmm26 + vpxorq zmm0,zmm0,zmm24 + vpxorq zmm3,zmm3,zmm25 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_613: + + or r13,r13 + je NEAR $L$_after_reduction_613 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_613: + jmp NEAR $L$_last_blocks_done_609 +$L$_last_num_blocks_is_3_609: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + kmovq k1,[rax*8+r10] + cmp r15d,253 + jae NEAR $L$_16_blocks_overflow_614 + vpaddd zmm0,zmm2,zmm28 + jmp NEAR $L$_16_blocks_ok_614 + +$L$_16_blocks_overflow_614: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpshufb zmm0,zmm0,zmm29 +$L$_16_blocks_ok_614: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm0,2 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + 
vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17{k1}{z},[r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vpxorq zmm0,zmm0,zmm17 + vextracti32x4 xmm11,zmm0,2 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10]{k1},zmm0 + vmovdqu8 zmm17{k1}{z},zmm17 + vpshufb zmm17,zmm17,zmm29 + vextracti32x4 xmm7,zmm17,2 + sub r13,16 * (3 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_615 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpxorq zmm4,zmm4,zmm26 + vpxorq zmm0,zmm0,zmm24 + vpxorq zmm3,zmm3,zmm25 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_615 +$L$_small_initial_partial_block_615: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm17,ymm1,0x01 + vpclmulqdq ymm5,ymm17,ymm1,0x10 + vpclmulqdq ymm0,ymm17,ymm1,0x11 + vpclmulqdq ymm3,ymm17,ymm1,0x00 + vpxorq zmm4,zmm4,zmm26 + vpxorq zmm0,zmm0,zmm24 + vpxorq zmm3,zmm3,zmm25 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_615: + + or r13,r13 + je NEAR 
$L$_after_reduction_615 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_615: + jmp NEAR $L$_last_blocks_done_609 +$L$_last_num_blocks_is_4_609: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + kmovq k1,[rax*8+r10] + cmp r15d,252 + jae NEAR $L$_16_blocks_overflow_616 + vpaddd zmm0,zmm2,zmm28 + jmp NEAR $L$_16_blocks_ok_616 + +$L$_16_blocks_overflow_616: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpshufb zmm0,zmm0,zmm29 +$L$_16_blocks_ok_616: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm0,3 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17{k1}{z},[r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vpxorq zmm0,zmm0,zmm17 + vextracti32x4 xmm11,zmm0,3 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10]{k1},zmm0 + vmovdqu8 zmm17{k1}{z},zmm17 + vpshufb zmm17,zmm17,zmm29 + vextracti32x4 xmm7,zmm17,3 + sub r13,16 * (4 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_617 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + + vpxorq zmm30,zmm30,zmm26 + vpxorq zmm8,zmm8,zmm24 + vpxorq zmm22,zmm22,zmm25 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + 
vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_617 +$L$_small_initial_partial_block_617: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpxorq zmm4,zmm4,zmm26 + vpxorq zmm0,zmm0,zmm24 + vpxorq zmm3,zmm3,zmm25 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_617: + + or r13,r13 + je NEAR $L$_after_reduction_617 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_617: + jmp NEAR $L$_last_blocks_done_609 +$L$_last_num_blocks_is_5_609: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,64 + kmovq k1,[rax*8+r10] + cmp r15d,251 + jae NEAR $L$_16_blocks_overflow_618 + vpaddd zmm0,zmm2,zmm28 + vpaddd xmm3,xmm0,xmm27 + jmp NEAR $L$_16_blocks_ok_618 + +$L$_16_blocks_overflow_618: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb xmm3,xmm3,xmm29 +$L$_16_blocks_ok_618: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm3,0 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 xmm19{k1}{z},[64+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq 
zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast xmm3,xmm3,xmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq xmm3,xmm3,xmm19 + vextracti32x4 xmm11,zmm3,0 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 XMMWORD[64+r11*1+r10]{k1},xmm3 + vmovdqu8 zmm19{k1}{z},zmm19 + vpshufb zmm17,zmm17,zmm29 + vpshufb xmm19,xmm19,xmm29 + vextracti32x4 xmm7,zmm19,0 + sub r13,16 * (5 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_619 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm19,xmm1,0x01 + vpclmulqdq xmm5,xmm19,xmm1,0x10 + vpclmulqdq xmm0,xmm19,xmm1,0x11 + vpclmulqdq xmm3,xmm19,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_619 +$L$_small_initial_partial_block_619: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + + vpxorq zmm30,zmm30,zmm26 + vpxorq zmm8,zmm8,zmm24 + vpxorq zmm22,zmm22,zmm25 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_619: + + or r13,r13 + je NEAR $L$_after_reduction_619 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_619: + jmp NEAR $L$_last_blocks_done_609 +$L$_last_num_blocks_is_6_609: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,64 + kmovq k1,[rax*8+r10] + cmp r15d,250 + jae NEAR $L$_16_blocks_overflow_620 + vpaddd zmm0,zmm2,zmm28 + vpaddd ymm3,ymm0,ymm27 + jmp NEAR $L$_16_blocks_ok_620 + +$L$_16_blocks_overflow_620: + vpshufb 
zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb ymm3,ymm3,ymm29 +$L$_16_blocks_ok_620: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm3,1 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 ymm19{k1}{z},[64+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast ymm3,ymm3,ymm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm11,zmm3,1 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 YMMWORD[64+r11*1+r10]{k1},ymm3 + vmovdqu8 zmm19{k1}{z},zmm19 + vpshufb zmm17,zmm17,zmm29 + vpshufb ymm19,ymm19,ymm29 + vextracti32x4 xmm7,zmm19,1 + sub r13,16 * (6 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_621 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm19,ymm1,0x01 + vpclmulqdq ymm5,ymm19,ymm1,0x10 + vpclmulqdq ymm0,ymm19,ymm1,0x11 + vpclmulqdq ymm3,ymm19,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq 
ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_621 +$L$_small_initial_partial_block_621: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm19,xmm1,0x01 + vpclmulqdq xmm5,xmm19,xmm1,0x10 + vpclmulqdq xmm0,xmm19,xmm1,0x11 + vpclmulqdq xmm3,xmm19,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_621: + + or r13,r13 + je NEAR $L$_after_reduction_621 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_621: + jmp NEAR $L$_last_blocks_done_609 +$L$_last_num_blocks_is_7_609: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,64 + kmovq k1,[rax*8+r10] + cmp r15d,249 + jae NEAR $L$_16_blocks_overflow_622 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + jmp NEAR $L$_16_blocks_ok_622 + +$L$_16_blocks_overflow_622: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 +$L$_16_blocks_ok_622: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm3,2 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + 
vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19{k1}{z},[64+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti32x4 xmm11,zmm3,2 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10]{k1},zmm3 + vmovdqu8 zmm19{k1}{z},zmm19 + vpshufb zmm17,zmm17,zmm29 + vpshufb zmm19,zmm19,zmm29 + vextracti32x4 xmm7,zmm19,2 + sub r13,16 * (7 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_623 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm19,zmm1,0x01 + vpclmulqdq zmm5,zmm19,zmm1,0x10 + vpclmulqdq zmm0,zmm19,zmm1,0x11 + vpclmulqdq zmm3,zmm19,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_623 +$L$_small_initial_partial_block_623: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm19,ymm1,0x01 + vpclmulqdq ymm5,ymm19,ymm1,0x10 + vpclmulqdq ymm0,ymm19,ymm1,0x11 + vpclmulqdq ymm3,ymm19,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq 
ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_623: + + or r13,r13 + je NEAR $L$_after_reduction_623 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_623: + jmp NEAR $L$_last_blocks_done_609 +$L$_last_num_blocks_is_8_609: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,64 + kmovq k1,[rax*8+r10] + cmp r15d,248 + jae NEAR $L$_16_blocks_overflow_624 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + jmp NEAR $L$_16_blocks_ok_624 + +$L$_16_blocks_overflow_624: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 +$L$_16_blocks_ok_624: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm3,3 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19{k1}{z},[64+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti32x4 xmm11,zmm3,3 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10]{k1},zmm3 + vmovdqu8 zmm19{k1}{z},zmm19 + vpshufb zmm17,zmm17,zmm29 + vpshufb 
zmm19,zmm19,zmm29 + vextracti32x4 xmm7,zmm19,3 + sub r13,16 * (8 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_625 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[224+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + + vpxorq zmm30,zmm30,zmm26 + vpxorq zmm8,zmm8,zmm24 + vpxorq zmm22,zmm22,zmm25 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_625 +$L$_small_initial_partial_block_625: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm19,zmm1,0x01 + vpclmulqdq zmm5,zmm19,zmm1,0x10 + vpclmulqdq zmm0,zmm19,zmm1,0x11 + vpclmulqdq zmm3,zmm19,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_625: + + or r13,r13 + je NEAR $L$_after_reduction_625 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_625: + jmp NEAR $L$_last_blocks_done_609 +$L$_last_num_blocks_is_9_609: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,128 + kmovq k1,[rax*8+r10] + cmp r15d,247 + jae NEAR $L$_16_blocks_overflow_626 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd xmm4,xmm3,xmm27 + jmp NEAR $L$_16_blocks_ok_626 + +$L$_16_blocks_overflow_626: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb xmm4,xmm4,xmm29 +$L$_16_blocks_ok_626: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm4,0 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + 
vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 xmm20{k1}{z},[128+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast xmm4,xmm4,xmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq xmm4,xmm4,xmm20 + vextracti32x4 xmm11,zmm4,0 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 XMMWORD[128+r11*1+r10]{k1},xmm4 + vmovdqu8 zmm20{k1}{z},zmm20 + vpshufb zmm17,zmm17,zmm29 + vpshufb zmm19,zmm19,zmm29 + vpshufb xmm20,xmm20,xmm29 + vextracti32x4 xmm7,zmm20,0 + sub r13,16 * (9 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_627 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[208+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm20,xmm1,0x01 + vpclmulqdq xmm5,xmm20,xmm1,0x10 + vpclmulqdq xmm0,xmm20,xmm1,0x11 + vpclmulqdq xmm3,xmm20,xmm1,0x00 + + vpxorq 
zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_627 +$L$_small_initial_partial_block_627: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[224+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + + vpxorq zmm30,zmm30,zmm26 + vpxorq zmm8,zmm8,zmm24 + vpxorq zmm22,zmm22,zmm25 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_627: + + or r13,r13 + je NEAR $L$_after_reduction_627 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_627: + jmp NEAR $L$_last_blocks_done_609 +$L$_last_num_blocks_is_10_609: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,128 + kmovq k1,[rax*8+r10] + cmp r15d,246 + jae NEAR $L$_16_blocks_overflow_628 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd ymm4,ymm3,ymm27 + jmp NEAR $L$_16_blocks_ok_628 + +$L$_16_blocks_overflow_628: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb ymm4,ymm4,ymm29 +$L$_16_blocks_ok_628: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm4,1 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq 
zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 ymm20{k1}{z},[128+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast ymm4,ymm4,ymm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq ymm4,ymm4,ymm20 + vextracti32x4 xmm11,zmm4,1 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 YMMWORD[128+r11*1+r10]{k1},ymm4 + vmovdqu8 zmm20{k1}{z},zmm20 + vpshufb zmm17,zmm17,zmm29 + vpshufb zmm19,zmm19,zmm29 + vpshufb ymm20,ymm20,ymm29 + vextracti32x4 xmm7,zmm20,1 + sub r13,16 * (10 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_629 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[192+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm20,ymm1,0x01 + vpclmulqdq ymm5,ymm20,ymm1,0x10 + vpclmulqdq ymm0,ymm20,ymm1,0x11 + vpclmulqdq ymm3,ymm20,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + 
vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_629 +$L$_small_initial_partial_block_629: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[208+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm20,xmm1,0x01 + vpclmulqdq xmm5,xmm20,xmm1,0x10 + vpclmulqdq xmm0,xmm20,xmm1,0x11 + vpclmulqdq xmm3,xmm20,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_629: + + or r13,r13 + je NEAR $L$_after_reduction_629 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_629: + jmp NEAR $L$_last_blocks_done_609 +$L$_last_num_blocks_is_11_609: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,128 + kmovq k1,[rax*8+r10] + cmp r15d,245 + jae NEAR $L$_16_blocks_overflow_630 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + jmp NEAR $L$_16_blocks_ok_630 + +$L$_16_blocks_overflow_630: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 +$L$_16_blocks_ok_630: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm4,2 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + 
vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20{k1}{z},[128+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vextracti32x4 xmm11,zmm4,2 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10]{k1},zmm4 + vmovdqu8 zmm20{k1}{z},zmm20 + vpshufb zmm17,zmm17,zmm29 + vpshufb zmm19,zmm19,zmm29 + vpshufb zmm20,zmm20,zmm29 + vextracti32x4 xmm7,zmm20,2 + sub r13,16 * (11 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_631 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[176+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm20,zmm1,0x01 + vpclmulqdq zmm5,zmm20,zmm1,0x10 + vpclmulqdq zmm0,zmm20,zmm1,0x11 + vpclmulqdq zmm3,zmm20,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_631 +$L$_small_initial_partial_block_631: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 
zmm1,ZMMWORD[192+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm20,ymm1,0x01 + vpclmulqdq ymm5,ymm20,ymm1,0x10 + vpclmulqdq ymm0,ymm20,ymm1,0x11 + vpclmulqdq ymm3,ymm20,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_631: + + or r13,r13 + je NEAR $L$_after_reduction_631 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_631: + jmp NEAR $L$_last_blocks_done_609 +$L$_last_num_blocks_is_12_609: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,128 + kmovq k1,[rax*8+r10] + cmp r15d,244 + jae NEAR $L$_16_blocks_overflow_632 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + jmp NEAR $L$_16_blocks_ok_632 + +$L$_16_blocks_overflow_632: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 +$L$_16_blocks_ok_632: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm4,3 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq 
zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20{k1}{z},[128+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vextracti32x4 xmm11,zmm4,3 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10]{k1},zmm4 + vmovdqu8 zmm20{k1}{z},zmm20 + vpshufb zmm17,zmm17,zmm29 + vpshufb zmm19,zmm19,zmm29 + vpshufb zmm20,zmm20,zmm29 + vextracti32x4 xmm7,zmm20,3 + sub r13,16 * (12 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_633 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[160+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[224+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + + vpxorq zmm30,zmm30,zmm26 + vpxorq zmm8,zmm8,zmm24 + vpxorq zmm22,zmm22,zmm25 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_633 +$L$_small_initial_partial_block_633: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[176+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq 
zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm20,zmm1,0x01 + vpclmulqdq zmm5,zmm20,zmm1,0x10 + vpclmulqdq zmm0,zmm20,zmm1,0x11 + vpclmulqdq zmm3,zmm20,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_633: + + or r13,r13 + je NEAR $L$_after_reduction_633 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_633: + jmp NEAR $L$_last_blocks_done_609 +$L$_last_num_blocks_is_13_609: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,192 + kmovq k1,[rax*8+r10] + cmp r15d,243 + jae NEAR $L$_16_blocks_overflow_634 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + vpaddd xmm5,xmm4,xmm27 + jmp NEAR $L$_16_blocks_ok_634 + +$L$_16_blocks_overflow_634: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpaddd zmm5,zmm4,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb xmm5,xmm5,xmm29 +$L$_16_blocks_ok_634: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm5,0 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vpxorq xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc xmm5,xmm5,xmm30 + vbroadcastf64x2 
zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20,ZMMWORD[128+r11*1+r9] + vmovdqu8 xmm21{k1}{z},[192+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vaesenclast xmm5,xmm5,xmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vpxorq xmm5,xmm5,xmm21 + vextracti32x4 xmm11,zmm5,0 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm4 + vmovdqu8 XMMWORD[192+r11*1+r10]{k1},xmm5 + vmovdqu8 zmm21{k1}{z},zmm21 + vpshufb zmm17,zmm17,zmm29 + vpshufb zmm19,zmm19,zmm29 + vpshufb zmm20,zmm20,zmm29 + vpshufb xmm21,xmm21,xmm29 + vextracti32x4 xmm7,zmm21,0 + sub r13,16 * (13 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_635 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[144+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[208+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm21,xmm1,0x01 + vpclmulqdq xmm5,xmm21,xmm1,0x10 + vpclmulqdq xmm0,xmm21,xmm1,0x11 + vpclmulqdq xmm3,xmm21,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_635 +$L$_small_initial_partial_block_635: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 
XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[160+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[224+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + + vpxorq zmm30,zmm30,zmm26 + vpxorq zmm8,zmm8,zmm24 + vpxorq zmm22,zmm22,zmm25 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_635: + + or r13,r13 + je NEAR $L$_after_reduction_635 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_635: + jmp NEAR $L$_last_blocks_done_609 +$L$_last_num_blocks_is_14_609: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,192 + kmovq k1,[rax*8+r10] + cmp r15d,242 + jae NEAR $L$_16_blocks_overflow_636 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + vpaddd ymm5,ymm4,ymm27 + jmp NEAR $L$_16_blocks_ok_636 + +$L$_16_blocks_overflow_636: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpaddd zmm5,zmm4,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb ymm5,ymm5,ymm29 +$L$_16_blocks_ok_636: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm5,1 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vpxorq ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc 
zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20,ZMMWORD[128+r11*1+r9] + vmovdqu8 ymm21{k1}{z},[192+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vaesenclast ymm5,ymm5,ymm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vpxorq ymm5,ymm5,ymm21 + vextracti32x4 xmm11,zmm5,1 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm4 + vmovdqu8 YMMWORD[192+r11*1+r10]{k1},ymm5 + vmovdqu8 zmm21{k1}{z},zmm21 + vpshufb zmm17,zmm17,zmm29 + vpshufb zmm19,zmm19,zmm29 + vpshufb zmm20,zmm20,zmm29 + vpshufb ymm21,ymm21,ymm29 + vextracti32x4 xmm7,zmm21,1 + sub r13,16 * (14 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_637 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[128+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[192+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm21,ymm1,0x01 + vpclmulqdq ymm5,ymm21,ymm1,0x10 + vpclmulqdq ymm0,ymm21,ymm1,0x11 + vpclmulqdq ymm3,ymm21,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq 
xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_637 +$L$_small_initial_partial_block_637: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[144+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[208+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm21,xmm1,0x01 + vpclmulqdq xmm5,xmm21,xmm1,0x10 + vpclmulqdq xmm0,xmm21,xmm1,0x11 + vpclmulqdq xmm3,xmm21,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_637: + + or r13,r13 + je NEAR $L$_after_reduction_637 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_637: + jmp NEAR $L$_last_blocks_done_609 +$L$_last_num_blocks_is_15_609: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,192 + kmovq k1,[rax*8+r10] + cmp r15d,241 + jae NEAR $L$_16_blocks_overflow_638 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + vpaddd zmm5,zmm4,zmm27 + jmp NEAR $L$_16_blocks_ok_638 + +$L$_16_blocks_overflow_638: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpaddd zmm5,zmm4,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb zmm5,zmm5,zmm29 +$L$_16_blocks_ok_638: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm5,2 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + 
vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20,ZMMWORD[128+r11*1+r9] + vmovdqu8 zmm21{k1}{z},[192+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vaesenclast zmm5,zmm5,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vpxorq zmm5,zmm5,zmm21 + vextracti32x4 xmm11,zmm5,2 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm4 + vmovdqu8 ZMMWORD[192+r11*1+r10]{k1},zmm5 + vmovdqu8 zmm21{k1}{z},zmm21 + vpshufb zmm17,zmm17,zmm29 + vpshufb zmm19,zmm19,zmm29 + vpshufb zmm20,zmm20,zmm29 + vpshufb zmm21,zmm21,zmm29 + vextracti32x4 xmm7,zmm21,2 + sub r13,16 * (15 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_639 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[112+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[176+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm21,zmm1,0x01 + vpclmulqdq 
zmm5,zmm21,zmm1,0x10 + vpclmulqdq zmm0,zmm21,zmm1,0x11 + vpclmulqdq zmm3,zmm21,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_639 +$L$_small_initial_partial_block_639: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[128+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[192+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm21,ymm1,0x01 + vpclmulqdq ymm5,ymm21,ymm1,0x10 + vpclmulqdq ymm0,ymm21,ymm1,0x11 + vpclmulqdq ymm3,ymm21,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_639: + + or r13,r13 + je NEAR $L$_after_reduction_639 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_639: + jmp NEAR $L$_last_blocks_done_609 +$L$_last_num_blocks_is_16_609: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,192 + kmovq k1,[rax*8+r10] + cmp r15d,240 + jae NEAR $L$_16_blocks_overflow_640 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + vpaddd zmm5,zmm4,zmm27 + jmp NEAR $L$_16_blocks_ok_640 + +$L$_16_blocks_overflow_640: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpaddd zmm5,zmm4,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb zmm5,zmm5,zmm29 +$L$_16_blocks_ok_640: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm5,3 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + 
vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20,ZMMWORD[128+r11*1+r9] + vmovdqu8 zmm21{k1}{z},[192+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vaesenclast zmm5,zmm5,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vpxorq zmm5,zmm5,zmm21 + vextracti32x4 xmm11,zmm5,3 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm4 + vmovdqu8 ZMMWORD[192+r11*1+r10]{k1},zmm5 + vmovdqu8 zmm21{k1}{z},zmm21 + vpshufb zmm17,zmm17,zmm29 + vpshufb zmm19,zmm19,zmm29 + vpshufb zmm20,zmm20,zmm29 + vpshufb zmm21,zmm21,zmm29 + vextracti32x4 xmm7,zmm21,3 + sub r13,16 * (16 - 1) +$L$_small_initial_partial_block_641: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[112+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 
+ vmovdqu64 zmm1,ZMMWORD[176+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm21,zmm1,0x01 + vpclmulqdq zmm5,zmm21,zmm1,0x10 + vpclmulqdq zmm0,zmm21,zmm1,0x11 + vpclmulqdq zmm3,zmm21,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_641: + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_641: + jmp NEAR $L$_last_blocks_done_609 +$L$_last_num_blocks_is_0_609: + vmovdqa64 zmm13,ZMMWORD[768+rsp] + vpxorq zmm13,zmm13,zmm14 + vmovdqu64 zmm12,ZMMWORD[rbx*1+rsp] + vpclmulqdq zmm0,zmm13,zmm12,0x11 + vpclmulqdq zmm3,zmm13,zmm12,0x00 + vpclmulqdq zmm4,zmm13,zmm12,0x01 + vpclmulqdq zmm5,zmm13,zmm12,0x10 + vmovdqa64 zmm13,ZMMWORD[832+rsp] + vmovdqu64 zmm12,ZMMWORD[64+rbx*1+rsp] + vpclmulqdq zmm6,zmm13,zmm12,0x11 + vpclmulqdq zmm7,zmm13,zmm12,0x00 + vpclmulqdq zmm10,zmm13,zmm12,0x01 + vpclmulqdq zmm11,zmm13,zmm12,0x10 + vpxorq zmm26,zmm4,zmm10 + vpxorq zmm24,zmm0,zmm6 + vpxorq zmm25,zmm3,zmm7 + vpternlogq zmm26,zmm5,zmm11,0x96 + vmovdqa64 zmm13,ZMMWORD[896+rsp] + vmovdqu64 zmm12,ZMMWORD[128+rbx*1+rsp] + vpclmulqdq zmm0,zmm13,zmm12,0x11 + vpclmulqdq zmm3,zmm13,zmm12,0x00 + vpclmulqdq zmm4,zmm13,zmm12,0x01 + vpclmulqdq zmm5,zmm13,zmm12,0x10 + vmovdqa64 zmm13,ZMMWORD[960+rsp] + vmovdqu64 zmm12,ZMMWORD[192+rbx*1+rsp] + vpclmulqdq zmm6,zmm13,zmm12,0x11 + vpclmulqdq zmm7,zmm13,zmm12,0x00 + vpclmulqdq zmm10,zmm13,zmm12,0x01 + vpclmulqdq zmm11,zmm13,zmm12,0x10 + + vpternlogq zmm26,zmm4,zmm10,0x96 + vpternlogq zmm24,zmm0,zmm6,0x96 + vpternlogq zmm25,zmm3,zmm7,0x96 + vpternlogq zmm26,zmm5,zmm11,0x96 + + vpsrldq zmm0,zmm26,8 + vpslldq zmm3,zmm26,8 + vpxorq zmm24,zmm24,zmm0 + vpxorq zmm25,zmm25,zmm3 + vextracti64x4 ymm0,zmm24,1 + vpxorq ymm24,ymm24,ymm0 + vextracti32x4 xmm0,ymm24,1 + vpxorq xmm24,xmm24,xmm0 + vextracti64x4 ymm3,zmm25,1 + vpxorq ymm25,ymm25,ymm3 + vextracti32x4 xmm3,ymm25,1 + vpxorq xmm25,xmm25,xmm3 + vmovdqa64 xmm4,XMMWORD[POLY2] + + + vpclmulqdq xmm0,xmm4,xmm25,0x01 + vpslldq xmm0,xmm0,8 + vpxorq xmm0,xmm25,xmm0 + + + vpclmulqdq xmm3,xmm4,xmm0,0x00 + vpsrldq xmm3,xmm3,4 + vpclmulqdq xmm14,xmm4,xmm0,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm3,xmm24,0x96 + +$L$_last_blocks_done_609: + vpshufb xmm2,xmm2,xmm29 + jmp NEAR $L$_ghash_done_497 + +$L$_message_below_equal_16_blocks_497: + + + mov r12d,r13d + add r12d,15 + shr r12d,4 + cmp r12,8 + je NEAR 
$L$_small_initial_num_blocks_is_8_642 + jl NEAR $L$_small_initial_num_blocks_is_7_1_642 + + + cmp r12,12 + je NEAR $L$_small_initial_num_blocks_is_12_642 + jl NEAR $L$_small_initial_num_blocks_is_11_9_642 + + + cmp r12,16 + je NEAR $L$_small_initial_num_blocks_is_16_642 + cmp r12,15 + je NEAR $L$_small_initial_num_blocks_is_15_642 + cmp r12,14 + je NEAR $L$_small_initial_num_blocks_is_14_642 + jmp NEAR $L$_small_initial_num_blocks_is_13_642 + +$L$_small_initial_num_blocks_is_11_9_642: + + cmp r12,11 + je NEAR $L$_small_initial_num_blocks_is_11_642 + cmp r12,10 + je NEAR $L$_small_initial_num_blocks_is_10_642 + jmp NEAR $L$_small_initial_num_blocks_is_9_642 + +$L$_small_initial_num_blocks_is_7_1_642: + cmp r12,4 + je NEAR $L$_small_initial_num_blocks_is_4_642 + jl NEAR $L$_small_initial_num_blocks_is_3_1_642 + + cmp r12,7 + je NEAR $L$_small_initial_num_blocks_is_7_642 + cmp r12,6 + je NEAR $L$_small_initial_num_blocks_is_6_642 + jmp NEAR $L$_small_initial_num_blocks_is_5_642 + +$L$_small_initial_num_blocks_is_3_1_642: + + cmp r12,3 + je NEAR $L$_small_initial_num_blocks_is_3_642 + cmp r12,2 + je NEAR $L$_small_initial_num_blocks_is_2_642 + + + + + +$L$_small_initial_num_blocks_is_1_642: + vmovdqa64 xmm29,XMMWORD[SHUF_MASK] + vpaddd xmm0,xmm2,XMMWORD[ONE] + lea r10,[byte64_len_to_mask_table] + mov r15,r13 + kmovq k1,[r15*8+r10] + vextracti32x4 xmm2,zmm0,0 + vpshufb xmm0,xmm0,xmm29 + vmovdqu8 xmm6{k1}{z},[r11*1+r9] + vbroadcastf64x2 zmm15,ZMMWORD[rcx] + vpxorq xmm0,xmm0,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[16+rcx] + vaesenc xmm0,xmm0,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[32+rcx] + vaesenc xmm0,xmm0,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[48+rcx] + vaesenc xmm0,xmm0,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[64+rcx] + vaesenc xmm0,xmm0,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[80+rcx] + vaesenc xmm0,xmm0,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[96+rcx] + vaesenc xmm0,xmm0,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[112+rcx] + vaesenc xmm0,xmm0,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[128+rcx] + vaesenc xmm0,xmm0,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[144+rcx] + vaesenc xmm0,xmm0,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[160+rcx] + vaesenclast xmm0,xmm0,xmm15 + vpxorq xmm0,xmm0,xmm6 + vextracti32x4 xmm12,zmm0,0 + mov r10,QWORD[120+rbp] + vmovdqu8 XMMWORD[r11*1+r10]{k1},xmm0 + vmovdqu8 zmm0{k1}{z},zmm0 + vpshufb xmm6,xmm6,xmm29 + vextracti32x4 xmm13,zmm6,0 + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_643 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 xmm20,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm6,xmm20,0x01 + vpclmulqdq xmm5,xmm6,xmm20,0x10 + vpclmulqdq xmm0,xmm6,xmm20,0x11 + vpclmulqdq xmm3,xmm6,xmm20,0x00 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_643 +$L$_small_initial_partial_block_643: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm12 + + + + + + + + + + + + vpxorq xmm14,xmm14,xmm13 + + jmp NEAR $L$_after_reduction_643 +$L$_small_initial_compute_done_643: 
+$L$_after_reduction_643: + jmp NEAR $L$_small_initial_blocks_encrypted_642 +$L$_small_initial_num_blocks_is_2_642: + vmovdqa64 ymm29,YMMWORD[SHUF_MASK] + vshufi64x2 ymm0,ymm2,ymm2,0 + vpaddd ymm0,ymm0,YMMWORD[ddq_add_1234] + lea r10,[byte64_len_to_mask_table] + mov r15,r13 + kmovq k1,[r15*8+r10] + vextracti32x4 xmm2,zmm0,1 + vpshufb ymm0,ymm0,ymm29 + vmovdqu8 ymm6{k1}{z},[r11*1+r9] + vbroadcastf64x2 zmm15,ZMMWORD[rcx] + vpxorq ymm0,ymm0,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[16+rcx] + vaesenc ymm0,ymm0,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[32+rcx] + vaesenc ymm0,ymm0,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[48+rcx] + vaesenc ymm0,ymm0,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[64+rcx] + vaesenc ymm0,ymm0,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[80+rcx] + vaesenc ymm0,ymm0,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[96+rcx] + vaesenc ymm0,ymm0,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[112+rcx] + vaesenc ymm0,ymm0,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[128+rcx] + vaesenc ymm0,ymm0,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[144+rcx] + vaesenc ymm0,ymm0,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[160+rcx] + vaesenclast ymm0,ymm0,ymm15 + vpxorq ymm0,ymm0,ymm6 + vextracti32x4 xmm12,zmm0,1 + mov r10,QWORD[120+rbp] + vmovdqu8 YMMWORD[r11*1+r10]{k1},ymm0 + vmovdqu8 zmm0{k1}{z},zmm0 + vpshufb ymm6,ymm6,ymm29 + vextracti32x4 xmm13,zmm6,1 + sub r13,16 * (2 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_644 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 ymm20,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm6,ymm20,0x01 + vpclmulqdq ymm5,ymm6,ymm20,0x10 + vpclmulqdq ymm0,ymm6,ymm20,0x11 + vpclmulqdq ymm3,ymm6,ymm20,0x00 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_644 +$L$_small_initial_partial_block_644: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm12 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 xmm20,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm6,xmm20,0x01 + vpclmulqdq xmm5,xmm6,xmm20,0x10 + vpclmulqdq xmm0,xmm6,xmm20,0x11 + vpclmulqdq xmm3,xmm6,xmm20,0x00 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_644: + + or r13,r13 + je NEAR $L$_after_reduction_644 + vpxorq xmm14,xmm14,xmm13 +$L$_after_reduction_644: + jmp NEAR $L$_small_initial_blocks_encrypted_642 +$L$_small_initial_num_blocks_is_3_642: + vmovdqa64 zmm29,ZMMWORD[SHUF_MASK] + vshufi64x2 zmm2,zmm2,zmm2,0 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + lea 
r10,[byte64_len_to_mask_table] + mov r15,r13 + kmovq k1,[r15*8+r10] + vextracti32x4 xmm2,zmm0,2 + vpshufb zmm0,zmm0,zmm29 + vmovdqu8 zmm6{k1}{z},[r11*1+r9] + vbroadcastf64x2 zmm15,ZMMWORD[rcx] + vpxorq zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[16+rcx] + vaesenc zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[32+rcx] + vaesenc zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[48+rcx] + vaesenc zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[64+rcx] + vaesenc zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[80+rcx] + vaesenc zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[96+rcx] + vaesenc zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[112+rcx] + vaesenc zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[128+rcx] + vaesenc zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[160+rcx] + vaesenclast zmm0,zmm0,zmm15 + vpxorq zmm0,zmm0,zmm6 + vextracti32x4 xmm12,zmm0,2 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10]{k1},zmm0 + vmovdqu8 zmm0{k1}{z},zmm0 + vpshufb zmm6,zmm6,zmm29 + vextracti32x4 xmm13,zmm6,2 + sub r13,16 * (3 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_645 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 ymm20,YMMWORD[304+rdx] + vinserti64x2 zmm20,zmm20,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm6,zmm20,0x01 + vpclmulqdq zmm5,zmm6,zmm20,0x10 + vpclmulqdq zmm0,zmm6,zmm20,0x11 + vpclmulqdq zmm3,zmm6,zmm20,0x00 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_645 +$L$_small_initial_partial_block_645: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm12 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 ymm20,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm6,ymm20,0x01 + vpclmulqdq ymm5,ymm6,ymm20,0x10 + vpclmulqdq ymm0,ymm6,ymm20,0x11 + vpclmulqdq ymm3,ymm6,ymm20,0x00 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_645: + + or r13,r13 + je NEAR $L$_after_reduction_645 + vpxorq xmm14,xmm14,xmm13 +$L$_after_reduction_645: + jmp NEAR $L$_small_initial_blocks_encrypted_642 +$L$_small_initial_num_blocks_is_4_642: + vmovdqa64 zmm29,ZMMWORD[SHUF_MASK] + vshufi64x2 zmm2,zmm2,zmm2,0 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + lea r10,[byte64_len_to_mask_table] + mov r15,r13 + kmovq k1,[r15*8+r10] + vextracti32x4 xmm2,zmm0,3 + vpshufb zmm0,zmm0,zmm29 + vmovdqu8 zmm6{k1}{z},[r11*1+r9] + vbroadcastf64x2 
zmm15,ZMMWORD[rcx] + vpxorq zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[16+rcx] + vaesenc zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[32+rcx] + vaesenc zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[48+rcx] + vaesenc zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[64+rcx] + vaesenc zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[80+rcx] + vaesenc zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[96+rcx] + vaesenc zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[112+rcx] + vaesenc zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[128+rcx] + vaesenc zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[160+rcx] + vaesenclast zmm0,zmm0,zmm15 + vpxorq zmm0,zmm0,zmm6 + vextracti32x4 xmm12,zmm0,3 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10]{k1},zmm0 + vmovdqu8 zmm0{k1}{z},zmm0 + vpshufb zmm6,zmm6,zmm29 + vextracti32x4 xmm13,zmm6,3 + sub r13,16 * (4 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_646 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[288+rdx] + vpclmulqdq zmm15,zmm6,zmm20,0x11 + vpclmulqdq zmm16,zmm6,zmm20,0x00 + vpclmulqdq zmm17,zmm6,zmm20,0x01 + vpclmulqdq zmm19,zmm6,zmm20,0x10 + + vpxorq zmm17,zmm17,zmm19 + vpsrldq zmm4,zmm17,8 + vpslldq zmm5,zmm17,8 + vpxorq zmm0,zmm15,zmm4 + vpxorq zmm3,zmm16,zmm5 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_646 +$L$_small_initial_partial_block_646: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm12 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 ymm20,YMMWORD[304+rdx] + vinserti64x2 zmm20,zmm20,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm6,zmm20,0x01 + vpclmulqdq zmm5,zmm6,zmm20,0x10 + vpclmulqdq zmm0,zmm6,zmm20,0x11 + vpclmulqdq zmm3,zmm6,zmm20,0x00 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_646: + + or r13,r13 + je NEAR $L$_after_reduction_646 + vpxorq xmm14,xmm14,xmm13 +$L$_after_reduction_646: + jmp NEAR $L$_small_initial_blocks_encrypted_642 +$L$_small_initial_num_blocks_is_5_642: + vmovdqa64 zmm29,ZMMWORD[SHUF_MASK] + vshufi64x2 zmm2,zmm2,zmm2,0 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpaddd zmm3,zmm2,ZMMWORD[ddq_add_5678] + lea r10,[byte64_len_to_mask_table] + mov r15,r13 + sub r15,64 + kmovq k1,[r15*8+r10] + vextracti32x4 xmm2,zmm3,0 + vpshufb zmm0,zmm0,zmm29 + vpshufb xmm3,xmm3,xmm29 + vmovdqu8 zmm6,ZMMWORD[r11*1+r9] + vmovdqu8 xmm7{k1}{z},[64+r11*1+r9] + vbroadcastf64x2 zmm15,ZMMWORD[rcx] + vpxorq zmm0,zmm0,zmm15 + vpxorq 
xmm3,xmm3,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[16+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc xmm3,xmm3,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[32+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc xmm3,xmm3,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[48+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc xmm3,xmm3,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[64+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc xmm3,xmm3,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[80+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc xmm3,xmm3,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[96+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc xmm3,xmm3,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[112+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc xmm3,xmm3,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[128+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc xmm3,xmm3,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc xmm3,xmm3,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[160+rcx] + vaesenclast zmm0,zmm0,zmm15 + vaesenclast xmm3,xmm3,xmm15 + vpxorq zmm0,zmm0,zmm6 + vpxorq xmm3,xmm3,xmm7 + vextracti32x4 xmm12,zmm3,0 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 XMMWORD[64+r11*1+r10]{k1},xmm3 + vmovdqu8 zmm3{k1}{z},zmm3 + vpshufb zmm6,zmm6,zmm29 + vpshufb xmm7,xmm7,xmm29 + vextracti32x4 xmm13,zmm7,0 + sub r13,16 * (5 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_647 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[272+rdx] + vpclmulqdq zmm15,zmm6,zmm20,0x11 + vpclmulqdq zmm16,zmm6,zmm20,0x00 + vpclmulqdq zmm17,zmm6,zmm20,0x01 + vpclmulqdq zmm19,zmm6,zmm20,0x10 + vmovdqu64 xmm20,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm7,xmm20,0x01 + vpclmulqdq xmm5,xmm7,xmm20,0x10 + vpclmulqdq xmm0,xmm7,xmm20,0x11 + vpclmulqdq xmm3,xmm7,xmm20,0x00 + + vpxorq zmm4,zmm4,zmm17 + vpxorq zmm5,zmm5,zmm19 + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm16 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_647 +$L$_small_initial_partial_block_647: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm12 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[288+rdx] + vpclmulqdq zmm15,zmm6,zmm20,0x11 + vpclmulqdq zmm16,zmm6,zmm20,0x00 + vpclmulqdq zmm17,zmm6,zmm20,0x01 + vpclmulqdq zmm19,zmm6,zmm20,0x10 + + vpxorq zmm17,zmm17,zmm19 + vpsrldq zmm4,zmm17,8 + vpslldq zmm5,zmm17,8 + vpxorq zmm0,zmm15,zmm4 + vpxorq zmm3,zmm16,zmm5 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_647: + + or r13,r13 + je NEAR $L$_after_reduction_647 + vpxorq xmm14,xmm14,xmm13 +$L$_after_reduction_647: + 
jmp NEAR $L$_small_initial_blocks_encrypted_642 +$L$_small_initial_num_blocks_is_6_642: + vmovdqa64 zmm29,ZMMWORD[SHUF_MASK] + vshufi64x2 zmm2,zmm2,zmm2,0 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpaddd zmm3,zmm2,ZMMWORD[ddq_add_5678] + lea r10,[byte64_len_to_mask_table] + mov r15,r13 + sub r15,64 + kmovq k1,[r15*8+r10] + vextracti32x4 xmm2,zmm3,1 + vpshufb zmm0,zmm0,zmm29 + vpshufb ymm3,ymm3,ymm29 + vmovdqu8 zmm6,ZMMWORD[r11*1+r9] + vmovdqu8 ymm7{k1}{z},[64+r11*1+r9] + vbroadcastf64x2 zmm15,ZMMWORD[rcx] + vpxorq zmm0,zmm0,zmm15 + vpxorq ymm3,ymm3,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[16+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc ymm3,ymm3,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[32+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc ymm3,ymm3,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[48+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc ymm3,ymm3,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[64+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc ymm3,ymm3,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[80+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc ymm3,ymm3,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[96+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc ymm3,ymm3,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[112+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc ymm3,ymm3,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[128+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc ymm3,ymm3,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc ymm3,ymm3,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[160+rcx] + vaesenclast zmm0,zmm0,zmm15 + vaesenclast ymm3,ymm3,ymm15 + vpxorq zmm0,zmm0,zmm6 + vpxorq ymm3,ymm3,ymm7 + vextracti32x4 xmm12,zmm3,1 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 YMMWORD[64+r11*1+r10]{k1},ymm3 + vmovdqu8 zmm3{k1}{z},zmm3 + vpshufb zmm6,zmm6,zmm29 + vpshufb ymm7,ymm7,ymm29 + vextracti32x4 xmm13,zmm7,1 + sub r13,16 * (6 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_648 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[256+rdx] + vpclmulqdq zmm15,zmm6,zmm20,0x11 + vpclmulqdq zmm16,zmm6,zmm20,0x00 + vpclmulqdq zmm17,zmm6,zmm20,0x01 + vpclmulqdq zmm19,zmm6,zmm20,0x10 + vmovdqu64 ymm20,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm7,ymm20,0x01 + vpclmulqdq ymm5,ymm7,ymm20,0x10 + vpclmulqdq ymm0,ymm7,ymm20,0x11 + vpclmulqdq ymm3,ymm7,ymm20,0x00 + + vpxorq zmm4,zmm4,zmm17 + vpxorq zmm5,zmm5,zmm19 + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm16 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_648 +$L$_small_initial_partial_block_648: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm12 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[272+rdx] + vpclmulqdq zmm15,zmm6,zmm20,0x11 + vpclmulqdq zmm16,zmm6,zmm20,0x00 + vpclmulqdq zmm17,zmm6,zmm20,0x01 + vpclmulqdq zmm19,zmm6,zmm20,0x10 + vmovdqu64 xmm20,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm7,xmm20,0x01 + vpclmulqdq xmm5,xmm7,xmm20,0x10 + vpclmulqdq xmm0,xmm7,xmm20,0x11 + vpclmulqdq xmm3,xmm7,xmm20,0x00 + + vpxorq zmm4,zmm4,zmm17 + vpxorq 
zmm5,zmm5,zmm19 + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm16 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_648: + + or r13,r13 + je NEAR $L$_after_reduction_648 + vpxorq xmm14,xmm14,xmm13 +$L$_after_reduction_648: + jmp NEAR $L$_small_initial_blocks_encrypted_642 +$L$_small_initial_num_blocks_is_7_642: + vmovdqa64 zmm29,ZMMWORD[SHUF_MASK] + vshufi64x2 zmm2,zmm2,zmm2,0 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpaddd zmm3,zmm2,ZMMWORD[ddq_add_5678] + lea r10,[byte64_len_to_mask_table] + mov r15,r13 + sub r15,64 + kmovq k1,[r15*8+r10] + vextracti32x4 xmm2,zmm3,2 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vmovdqu8 zmm6,ZMMWORD[r11*1+r9] + vmovdqu8 zmm7{k1}{z},[64+r11*1+r9] + vbroadcastf64x2 zmm15,ZMMWORD[rcx] + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[16+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[32+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[48+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[64+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[80+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[96+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[112+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[128+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[160+rcx] + vaesenclast zmm0,zmm0,zmm15 + vaesenclast zmm3,zmm3,zmm15 + vpxorq zmm0,zmm0,zmm6 + vpxorq zmm3,zmm3,zmm7 + vextracti32x4 xmm12,zmm3,2 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10]{k1},zmm3 + vmovdqu8 zmm3{k1}{z},zmm3 + vpshufb zmm6,zmm6,zmm29 + vpshufb zmm7,zmm7,zmm29 + vextracti32x4 xmm13,zmm7,2 + sub r13,16 * (7 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_649 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[240+rdx] + vpclmulqdq zmm15,zmm6,zmm20,0x11 + vpclmulqdq zmm16,zmm6,zmm20,0x00 + vpclmulqdq zmm17,zmm6,zmm20,0x01 + vpclmulqdq zmm19,zmm6,zmm20,0x10 + vmovdqu64 ymm20,YMMWORD[304+rdx] + vinserti64x2 zmm20,zmm20,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm7,zmm20,0x01 + vpclmulqdq zmm5,zmm7,zmm20,0x10 + vpclmulqdq zmm0,zmm7,zmm20,0x11 + vpclmulqdq zmm3,zmm7,zmm20,0x00 + + vpxorq zmm4,zmm4,zmm17 + vpxorq zmm5,zmm5,zmm19 + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm16 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 
xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_649 +$L$_small_initial_partial_block_649: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm12 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[256+rdx] + vpclmulqdq zmm15,zmm6,zmm20,0x11 + vpclmulqdq zmm16,zmm6,zmm20,0x00 + vpclmulqdq zmm17,zmm6,zmm20,0x01 + vpclmulqdq zmm19,zmm6,zmm20,0x10 + vmovdqu64 ymm20,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm7,ymm20,0x01 + vpclmulqdq ymm5,ymm7,ymm20,0x10 + vpclmulqdq ymm0,ymm7,ymm20,0x11 + vpclmulqdq ymm3,ymm7,ymm20,0x00 + + vpxorq zmm4,zmm4,zmm17 + vpxorq zmm5,zmm5,zmm19 + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm16 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_649: + + or r13,r13 + je NEAR $L$_after_reduction_649 + vpxorq xmm14,xmm14,xmm13 +$L$_after_reduction_649: + jmp NEAR $L$_small_initial_blocks_encrypted_642 +$L$_small_initial_num_blocks_is_8_642: + vmovdqa64 zmm29,ZMMWORD[SHUF_MASK] + vshufi64x2 zmm2,zmm2,zmm2,0 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpaddd zmm3,zmm2,ZMMWORD[ddq_add_5678] + lea r10,[byte64_len_to_mask_table] + mov r15,r13 + sub r15,64 + kmovq k1,[r15*8+r10] + vextracti32x4 xmm2,zmm3,3 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vmovdqu8 zmm6,ZMMWORD[r11*1+r9] + vmovdqu8 zmm7{k1}{z},[64+r11*1+r9] + vbroadcastf64x2 zmm15,ZMMWORD[rcx] + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[16+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[32+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[48+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[64+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[80+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[96+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[112+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[128+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[160+rcx] + vaesenclast zmm0,zmm0,zmm15 + vaesenclast zmm3,zmm3,zmm15 + vpxorq zmm0,zmm0,zmm6 + vpxorq zmm3,zmm3,zmm7 + vextracti32x4 xmm12,zmm3,3 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10]{k1},zmm3 + vmovdqu8 zmm3{k1}{z},zmm3 + vpshufb zmm6,zmm6,zmm29 + vpshufb zmm7,zmm7,zmm29 + vextracti32x4 xmm13,zmm7,3 + sub r13,16 * (8 - 1) + + + cmp r13,16 + jl NEAR 
$L$_small_initial_partial_block_650 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[224+rdx] + vpclmulqdq zmm0,zmm6,zmm20,0x11 + vpclmulqdq zmm3,zmm6,zmm20,0x00 + vpclmulqdq zmm4,zmm6,zmm20,0x01 + vpclmulqdq zmm5,zmm6,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[288+rdx] + vpclmulqdq zmm15,zmm7,zmm20,0x11 + vpclmulqdq zmm16,zmm7,zmm20,0x00 + vpclmulqdq zmm17,zmm7,zmm20,0x01 + vpclmulqdq zmm19,zmm7,zmm20,0x10 + vpxorq zmm15,zmm0,zmm15 + vpxorq zmm16,zmm3,zmm16 + vpxorq zmm17,zmm4,zmm17 + vpxorq zmm19,zmm5,zmm19 + + vpxorq zmm17,zmm17,zmm19 + vpsrldq zmm4,zmm17,8 + vpslldq zmm5,zmm17,8 + vpxorq zmm0,zmm15,zmm4 + vpxorq zmm3,zmm16,zmm5 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_650 +$L$_small_initial_partial_block_650: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm12 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[240+rdx] + vpclmulqdq zmm15,zmm6,zmm20,0x11 + vpclmulqdq zmm16,zmm6,zmm20,0x00 + vpclmulqdq zmm17,zmm6,zmm20,0x01 + vpclmulqdq zmm19,zmm6,zmm20,0x10 + vmovdqu64 ymm20,YMMWORD[304+rdx] + vinserti64x2 zmm20,zmm20,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm7,zmm20,0x01 + vpclmulqdq zmm5,zmm7,zmm20,0x10 + vpclmulqdq zmm0,zmm7,zmm20,0x11 + vpclmulqdq zmm3,zmm7,zmm20,0x00 + + vpxorq zmm4,zmm4,zmm17 + vpxorq zmm5,zmm5,zmm19 + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm16 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_650: + + or r13,r13 + je NEAR $L$_after_reduction_650 + vpxorq xmm14,xmm14,xmm13 +$L$_after_reduction_650: + jmp NEAR $L$_small_initial_blocks_encrypted_642 +$L$_small_initial_num_blocks_is_9_642: + vmovdqa64 zmm29,ZMMWORD[SHUF_MASK] + vshufi64x2 zmm2,zmm2,zmm2,0 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpaddd zmm3,zmm2,ZMMWORD[ddq_add_5678] + vpaddd zmm4,zmm0,ZMMWORD[ddq_add_8888] + lea r10,[byte64_len_to_mask_table] + mov r15,r13 + sub r15,128 + kmovq k1,[r15*8+r10] + vextracti32x4 xmm2,zmm4,0 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb xmm4,xmm4,xmm29 + vmovdqu8 zmm6,ZMMWORD[r11*1+r9] + vmovdqu8 zmm7,ZMMWORD[64+r11*1+r9] + vmovdqu8 xmm10{k1}{z},[128+r11*1+r9] + vbroadcastf64x2 zmm15,ZMMWORD[rcx] + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm15 + vpxorq xmm4,xmm4,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[16+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc xmm4,xmm4,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[32+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc xmm4,xmm4,xmm15 + vbroadcastf64x2 
zmm15,ZMMWORD[48+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc xmm4,xmm4,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[64+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc xmm4,xmm4,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[80+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc xmm4,xmm4,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[96+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc xmm4,xmm4,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[112+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc xmm4,xmm4,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[128+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc xmm4,xmm4,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc xmm4,xmm4,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[160+rcx] + vaesenclast zmm0,zmm0,zmm15 + vaesenclast zmm3,zmm3,zmm15 + vaesenclast xmm4,xmm4,xmm15 + vpxorq zmm0,zmm0,zmm6 + vpxorq zmm3,zmm3,zmm7 + vpxorq xmm4,xmm4,xmm10 + vextracti32x4 xmm12,zmm4,0 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 XMMWORD[128+r11*1+r10]{k1},xmm4 + vmovdqu8 zmm4{k1}{z},zmm4 + vpshufb zmm6,zmm6,zmm29 + vpshufb zmm7,zmm7,zmm29 + vpshufb xmm10,xmm10,xmm29 + vextracti32x4 xmm13,zmm10,0 + sub r13,16 * (9 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_651 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[208+rdx] + vpclmulqdq zmm0,zmm6,zmm20,0x11 + vpclmulqdq zmm3,zmm6,zmm20,0x00 + vpclmulqdq zmm4,zmm6,zmm20,0x01 + vpclmulqdq zmm5,zmm6,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[272+rdx] + vpclmulqdq zmm15,zmm7,zmm20,0x11 + vpclmulqdq zmm16,zmm7,zmm20,0x00 + vpclmulqdq zmm17,zmm7,zmm20,0x01 + vpclmulqdq zmm19,zmm7,zmm20,0x10 + vpxorq zmm15,zmm0,zmm15 + vpxorq zmm16,zmm3,zmm16 + vpxorq zmm17,zmm4,zmm17 + vpxorq zmm19,zmm5,zmm19 + vmovdqu64 xmm20,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm10,xmm20,0x01 + vpclmulqdq xmm5,xmm10,xmm20,0x10 + vpclmulqdq xmm0,xmm10,xmm20,0x11 + vpclmulqdq xmm3,xmm10,xmm20,0x00 + + vpxorq zmm4,zmm4,zmm17 + vpxorq zmm5,zmm5,zmm19 + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm16 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_651 +$L$_small_initial_partial_block_651: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm12 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[224+rdx] + vpclmulqdq zmm0,zmm6,zmm20,0x11 + vpclmulqdq zmm3,zmm6,zmm20,0x00 + vpclmulqdq zmm4,zmm6,zmm20,0x01 + vpclmulqdq zmm5,zmm6,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[288+rdx] + vpclmulqdq zmm15,zmm7,zmm20,0x11 + vpclmulqdq zmm16,zmm7,zmm20,0x00 + vpclmulqdq zmm17,zmm7,zmm20,0x01 + vpclmulqdq zmm19,zmm7,zmm20,0x10 + vpxorq zmm15,zmm0,zmm15 + vpxorq zmm16,zmm3,zmm16 + vpxorq zmm17,zmm4,zmm17 + vpxorq zmm19,zmm5,zmm19 + + vpxorq zmm17,zmm17,zmm19 + vpsrldq zmm4,zmm17,8 + vpslldq zmm5,zmm17,8 + vpxorq zmm0,zmm15,zmm4 + vpxorq 
zmm3,zmm16,zmm5 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_651: + + or r13,r13 + je NEAR $L$_after_reduction_651 + vpxorq xmm14,xmm14,xmm13 +$L$_after_reduction_651: + jmp NEAR $L$_small_initial_blocks_encrypted_642 +$L$_small_initial_num_blocks_is_10_642: + vmovdqa64 zmm29,ZMMWORD[SHUF_MASK] + vshufi64x2 zmm2,zmm2,zmm2,0 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpaddd zmm3,zmm2,ZMMWORD[ddq_add_5678] + vpaddd zmm4,zmm0,ZMMWORD[ddq_add_8888] + lea r10,[byte64_len_to_mask_table] + mov r15,r13 + sub r15,128 + kmovq k1,[r15*8+r10] + vextracti32x4 xmm2,zmm4,1 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb ymm4,ymm4,ymm29 + vmovdqu8 zmm6,ZMMWORD[r11*1+r9] + vmovdqu8 zmm7,ZMMWORD[64+r11*1+r9] + vmovdqu8 ymm10{k1}{z},[128+r11*1+r9] + vbroadcastf64x2 zmm15,ZMMWORD[rcx] + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm15 + vpxorq ymm4,ymm4,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[16+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc ymm4,ymm4,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[32+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc ymm4,ymm4,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[48+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc ymm4,ymm4,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[64+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc ymm4,ymm4,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[80+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc ymm4,ymm4,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[96+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc ymm4,ymm4,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[112+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc ymm4,ymm4,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[128+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc ymm4,ymm4,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc ymm4,ymm4,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[160+rcx] + vaesenclast zmm0,zmm0,zmm15 + vaesenclast zmm3,zmm3,zmm15 + vaesenclast ymm4,ymm4,ymm15 + vpxorq zmm0,zmm0,zmm6 + vpxorq zmm3,zmm3,zmm7 + vpxorq ymm4,ymm4,ymm10 + vextracti32x4 xmm12,zmm4,1 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 YMMWORD[128+r11*1+r10]{k1},ymm4 + vmovdqu8 zmm4{k1}{z},zmm4 + vpshufb zmm6,zmm6,zmm29 + vpshufb zmm7,zmm7,zmm29 + vpshufb ymm10,ymm10,ymm29 + vextracti32x4 xmm13,zmm10,1 + sub r13,16 * (10 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_652 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[192+rdx] + vpclmulqdq zmm0,zmm6,zmm20,0x11 + vpclmulqdq zmm3,zmm6,zmm20,0x00 + vpclmulqdq zmm4,zmm6,zmm20,0x01 + vpclmulqdq zmm5,zmm6,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[256+rdx] + vpclmulqdq zmm15,zmm7,zmm20,0x11 + vpclmulqdq zmm16,zmm7,zmm20,0x00 + vpclmulqdq zmm17,zmm7,zmm20,0x01 + vpclmulqdq zmm19,zmm7,zmm20,0x10 + vpxorq zmm15,zmm0,zmm15 + vpxorq zmm16,zmm3,zmm16 + vpxorq zmm17,zmm4,zmm17 + vpxorq zmm19,zmm5,zmm19 + vmovdqu64 ymm20,YMMWORD[320+rdx] 
+ vpclmulqdq ymm4,ymm10,ymm20,0x01 + vpclmulqdq ymm5,ymm10,ymm20,0x10 + vpclmulqdq ymm0,ymm10,ymm20,0x11 + vpclmulqdq ymm3,ymm10,ymm20,0x00 + + vpxorq zmm4,zmm4,zmm17 + vpxorq zmm5,zmm5,zmm19 + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm16 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_652 +$L$_small_initial_partial_block_652: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm12 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[208+rdx] + vpclmulqdq zmm0,zmm6,zmm20,0x11 + vpclmulqdq zmm3,zmm6,zmm20,0x00 + vpclmulqdq zmm4,zmm6,zmm20,0x01 + vpclmulqdq zmm5,zmm6,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[272+rdx] + vpclmulqdq zmm15,zmm7,zmm20,0x11 + vpclmulqdq zmm16,zmm7,zmm20,0x00 + vpclmulqdq zmm17,zmm7,zmm20,0x01 + vpclmulqdq zmm19,zmm7,zmm20,0x10 + vpxorq zmm15,zmm0,zmm15 + vpxorq zmm16,zmm3,zmm16 + vpxorq zmm17,zmm4,zmm17 + vpxorq zmm19,zmm5,zmm19 + vmovdqu64 xmm20,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm10,xmm20,0x01 + vpclmulqdq xmm5,xmm10,xmm20,0x10 + vpclmulqdq xmm0,xmm10,xmm20,0x11 + vpclmulqdq xmm3,xmm10,xmm20,0x00 + + vpxorq zmm4,zmm4,zmm17 + vpxorq zmm5,zmm5,zmm19 + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm16 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_652: + + or r13,r13 + je NEAR $L$_after_reduction_652 + vpxorq xmm14,xmm14,xmm13 +$L$_after_reduction_652: + jmp NEAR $L$_small_initial_blocks_encrypted_642 +$L$_small_initial_num_blocks_is_11_642: + vmovdqa64 zmm29,ZMMWORD[SHUF_MASK] + vshufi64x2 zmm2,zmm2,zmm2,0 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpaddd zmm3,zmm2,ZMMWORD[ddq_add_5678] + vpaddd zmm4,zmm0,ZMMWORD[ddq_add_8888] + lea r10,[byte64_len_to_mask_table] + mov r15,r13 + sub r15,128 + kmovq k1,[r15*8+r10] + vextracti32x4 xmm2,zmm4,2 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vmovdqu8 zmm6,ZMMWORD[r11*1+r9] + vmovdqu8 zmm7,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm10{k1}{z},[128+r11*1+r9] + vbroadcastf64x2 zmm15,ZMMWORD[rcx] + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm15 + vpxorq zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[16+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[32+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[48+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc 
zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[64+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[80+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[96+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[112+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[128+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[160+rcx] + vaesenclast zmm0,zmm0,zmm15 + vaesenclast zmm3,zmm3,zmm15 + vaesenclast zmm4,zmm4,zmm15 + vpxorq zmm0,zmm0,zmm6 + vpxorq zmm3,zmm3,zmm7 + vpxorq zmm4,zmm4,zmm10 + vextracti32x4 xmm12,zmm4,2 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10]{k1},zmm4 + vmovdqu8 zmm4{k1}{z},zmm4 + vpshufb zmm6,zmm6,zmm29 + vpshufb zmm7,zmm7,zmm29 + vpshufb zmm10,zmm10,zmm29 + vextracti32x4 xmm13,zmm10,2 + sub r13,16 * (11 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_653 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[176+rdx] + vpclmulqdq zmm0,zmm6,zmm20,0x11 + vpclmulqdq zmm3,zmm6,zmm20,0x00 + vpclmulqdq zmm4,zmm6,zmm20,0x01 + vpclmulqdq zmm5,zmm6,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[240+rdx] + vpclmulqdq zmm15,zmm7,zmm20,0x11 + vpclmulqdq zmm16,zmm7,zmm20,0x00 + vpclmulqdq zmm17,zmm7,zmm20,0x01 + vpclmulqdq zmm19,zmm7,zmm20,0x10 + vpxorq zmm15,zmm0,zmm15 + vpxorq zmm16,zmm3,zmm16 + vpxorq zmm17,zmm4,zmm17 + vpxorq zmm19,zmm5,zmm19 + vmovdqu64 ymm20,YMMWORD[304+rdx] + vinserti64x2 zmm20,zmm20,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm10,zmm20,0x01 + vpclmulqdq zmm5,zmm10,zmm20,0x10 + vpclmulqdq zmm0,zmm10,zmm20,0x11 + vpclmulqdq zmm3,zmm10,zmm20,0x00 + + vpxorq zmm4,zmm4,zmm17 + vpxorq zmm5,zmm5,zmm19 + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm16 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_653 +$L$_small_initial_partial_block_653: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm12 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[192+rdx] + vpclmulqdq zmm0,zmm6,zmm20,0x11 + vpclmulqdq zmm3,zmm6,zmm20,0x00 + vpclmulqdq zmm4,zmm6,zmm20,0x01 + vpclmulqdq zmm5,zmm6,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[256+rdx] + vpclmulqdq zmm15,zmm7,zmm20,0x11 + vpclmulqdq zmm16,zmm7,zmm20,0x00 + vpclmulqdq zmm17,zmm7,zmm20,0x01 + vpclmulqdq zmm19,zmm7,zmm20,0x10 + vpxorq zmm15,zmm0,zmm15 + vpxorq zmm16,zmm3,zmm16 + vpxorq zmm17,zmm4,zmm17 + vpxorq zmm19,zmm5,zmm19 + vmovdqu64 ymm20,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm10,ymm20,0x01 + vpclmulqdq ymm5,ymm10,ymm20,0x10 + vpclmulqdq ymm0,ymm10,ymm20,0x11 + 
vpclmulqdq ymm3,ymm10,ymm20,0x00 + + vpxorq zmm4,zmm4,zmm17 + vpxorq zmm5,zmm5,zmm19 + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm16 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_653: + + or r13,r13 + je NEAR $L$_after_reduction_653 + vpxorq xmm14,xmm14,xmm13 +$L$_after_reduction_653: + jmp NEAR $L$_small_initial_blocks_encrypted_642 +$L$_small_initial_num_blocks_is_12_642: + vmovdqa64 zmm29,ZMMWORD[SHUF_MASK] + vshufi64x2 zmm2,zmm2,zmm2,0 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpaddd zmm3,zmm2,ZMMWORD[ddq_add_5678] + vpaddd zmm4,zmm0,ZMMWORD[ddq_add_8888] + lea r10,[byte64_len_to_mask_table] + mov r15,r13 + sub r15,128 + kmovq k1,[r15*8+r10] + vextracti32x4 xmm2,zmm4,3 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vmovdqu8 zmm6,ZMMWORD[r11*1+r9] + vmovdqu8 zmm7,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm10{k1}{z},[128+r11*1+r9] + vbroadcastf64x2 zmm15,ZMMWORD[rcx] + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm15 + vpxorq zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[16+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[32+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[48+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[64+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[80+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[96+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[112+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[128+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[160+rcx] + vaesenclast zmm0,zmm0,zmm15 + vaesenclast zmm3,zmm3,zmm15 + vaesenclast zmm4,zmm4,zmm15 + vpxorq zmm0,zmm0,zmm6 + vpxorq zmm3,zmm3,zmm7 + vpxorq zmm4,zmm4,zmm10 + vextracti32x4 xmm12,zmm4,3 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10]{k1},zmm4 + vmovdqu8 zmm4{k1}{z},zmm4 + vpshufb zmm6,zmm6,zmm29 + vpshufb zmm7,zmm7,zmm29 + vpshufb zmm10,zmm10,zmm29 + vextracti32x4 xmm13,zmm10,3 + sub r13,16 * (12 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_654 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[160+rdx] + vpclmulqdq zmm0,zmm6,zmm20,0x11 + vpclmulqdq zmm3,zmm6,zmm20,0x00 + vpclmulqdq zmm4,zmm6,zmm20,0x01 + vpclmulqdq zmm5,zmm6,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[224+rdx] + vpclmulqdq zmm15,zmm7,zmm20,0x11 + 
vpclmulqdq zmm16,zmm7,zmm20,0x00 + vpclmulqdq zmm17,zmm7,zmm20,0x01 + vpclmulqdq zmm19,zmm7,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[288+rdx] + vpclmulqdq zmm6,zmm10,zmm20,0x11 + vpclmulqdq zmm7,zmm10,zmm20,0x00 + vpternlogq zmm15,zmm6,zmm0,0x96 + vpternlogq zmm16,zmm7,zmm3,0x96 + vpclmulqdq zmm6,zmm10,zmm20,0x01 + vpclmulqdq zmm7,zmm10,zmm20,0x10 + vpternlogq zmm17,zmm6,zmm4,0x96 + vpternlogq zmm19,zmm7,zmm5,0x96 + + vpxorq zmm17,zmm17,zmm19 + vpsrldq zmm4,zmm17,8 + vpslldq zmm5,zmm17,8 + vpxorq zmm0,zmm15,zmm4 + vpxorq zmm3,zmm16,zmm5 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_654 +$L$_small_initial_partial_block_654: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm12 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[176+rdx] + vpclmulqdq zmm0,zmm6,zmm20,0x11 + vpclmulqdq zmm3,zmm6,zmm20,0x00 + vpclmulqdq zmm4,zmm6,zmm20,0x01 + vpclmulqdq zmm5,zmm6,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[240+rdx] + vpclmulqdq zmm15,zmm7,zmm20,0x11 + vpclmulqdq zmm16,zmm7,zmm20,0x00 + vpclmulqdq zmm17,zmm7,zmm20,0x01 + vpclmulqdq zmm19,zmm7,zmm20,0x10 + vpxorq zmm15,zmm0,zmm15 + vpxorq zmm16,zmm3,zmm16 + vpxorq zmm17,zmm4,zmm17 + vpxorq zmm19,zmm5,zmm19 + vmovdqu64 ymm20,YMMWORD[304+rdx] + vinserti64x2 zmm20,zmm20,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm10,zmm20,0x01 + vpclmulqdq zmm5,zmm10,zmm20,0x10 + vpclmulqdq zmm0,zmm10,zmm20,0x11 + vpclmulqdq zmm3,zmm10,zmm20,0x00 + + vpxorq zmm4,zmm4,zmm17 + vpxorq zmm5,zmm5,zmm19 + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm16 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_654: + + or r13,r13 + je NEAR $L$_after_reduction_654 + vpxorq xmm14,xmm14,xmm13 +$L$_after_reduction_654: + jmp NEAR $L$_small_initial_blocks_encrypted_642 +$L$_small_initial_num_blocks_is_13_642: + vmovdqa64 zmm29,ZMMWORD[SHUF_MASK] + vshufi64x2 zmm2,zmm2,zmm2,0 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpaddd zmm3,zmm2,ZMMWORD[ddq_add_5678] + vpaddd zmm4,zmm0,ZMMWORD[ddq_add_8888] + vpaddd zmm5,zmm3,ZMMWORD[ddq_add_8888] + lea r10,[byte64_len_to_mask_table] + mov r15,r13 + sub r15,192 + kmovq k1,[r15*8+r10] + vextracti32x4 xmm2,zmm5,0 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb xmm5,xmm5,xmm29 + vmovdqu8 zmm6,ZMMWORD[r11*1+r9] + vmovdqu8 zmm7,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm10,ZMMWORD[128+r11*1+r9] + vmovdqu8 xmm11{k1}{z},[192+r11*1+r9] + vbroadcastf64x2 zmm15,ZMMWORD[rcx] + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm15 + vpxorq zmm4,zmm4,zmm15 + vpxorq 
xmm5,xmm5,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[16+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc xmm5,xmm5,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[32+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc xmm5,xmm5,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[48+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc xmm5,xmm5,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[64+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc xmm5,xmm5,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[80+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc xmm5,xmm5,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[96+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc xmm5,xmm5,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[112+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc xmm5,xmm5,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[128+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc xmm5,xmm5,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc xmm5,xmm5,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[160+rcx] + vaesenclast zmm0,zmm0,zmm15 + vaesenclast zmm3,zmm3,zmm15 + vaesenclast zmm4,zmm4,zmm15 + vaesenclast xmm5,xmm5,xmm15 + vpxorq zmm0,zmm0,zmm6 + vpxorq zmm3,zmm3,zmm7 + vpxorq zmm4,zmm4,zmm10 + vpxorq xmm5,xmm5,xmm11 + vextracti32x4 xmm12,zmm5,0 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm4 + vmovdqu8 XMMWORD[192+r11*1+r10]{k1},xmm5 + vmovdqu8 zmm5{k1}{z},zmm5 + vpshufb zmm6,zmm6,zmm29 + vpshufb zmm7,zmm7,zmm29 + vpshufb zmm10,zmm10,zmm29 + vpshufb xmm11,xmm11,xmm29 + vextracti32x4 xmm13,zmm11,0 + sub r13,16 * (13 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_655 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[144+rdx] + vpclmulqdq zmm0,zmm6,zmm20,0x11 + vpclmulqdq zmm3,zmm6,zmm20,0x00 + vpclmulqdq zmm4,zmm6,zmm20,0x01 + vpclmulqdq zmm5,zmm6,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[208+rdx] + vpclmulqdq zmm15,zmm7,zmm20,0x11 + vpclmulqdq zmm16,zmm7,zmm20,0x00 + vpclmulqdq zmm17,zmm7,zmm20,0x01 + vpclmulqdq zmm19,zmm7,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[272+rdx] + vpclmulqdq zmm6,zmm10,zmm20,0x11 + vpclmulqdq zmm7,zmm10,zmm20,0x00 + vpternlogq zmm15,zmm6,zmm0,0x96 + vpternlogq zmm16,zmm7,zmm3,0x96 + vpclmulqdq zmm6,zmm10,zmm20,0x01 + vpclmulqdq zmm7,zmm10,zmm20,0x10 + vpternlogq zmm17,zmm6,zmm4,0x96 + vpternlogq zmm19,zmm7,zmm5,0x96 + vmovdqu64 xmm20,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm11,xmm20,0x01 + vpclmulqdq xmm5,xmm11,xmm20,0x10 + vpclmulqdq xmm0,xmm11,xmm20,0x11 + vpclmulqdq xmm3,xmm11,xmm20,0x00 + + vpxorq zmm4,zmm4,zmm17 + vpxorq zmm5,zmm5,zmm19 + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm16 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + 
vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_655 +$L$_small_initial_partial_block_655: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm12 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[160+rdx] + vpclmulqdq zmm0,zmm6,zmm20,0x11 + vpclmulqdq zmm3,zmm6,zmm20,0x00 + vpclmulqdq zmm4,zmm6,zmm20,0x01 + vpclmulqdq zmm5,zmm6,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[224+rdx] + vpclmulqdq zmm15,zmm7,zmm20,0x11 + vpclmulqdq zmm16,zmm7,zmm20,0x00 + vpclmulqdq zmm17,zmm7,zmm20,0x01 + vpclmulqdq zmm19,zmm7,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[288+rdx] + vpclmulqdq zmm6,zmm10,zmm20,0x11 + vpclmulqdq zmm7,zmm10,zmm20,0x00 + vpternlogq zmm15,zmm6,zmm0,0x96 + vpternlogq zmm16,zmm7,zmm3,0x96 + vpclmulqdq zmm6,zmm10,zmm20,0x01 + vpclmulqdq zmm7,zmm10,zmm20,0x10 + vpternlogq zmm17,zmm6,zmm4,0x96 + vpternlogq zmm19,zmm7,zmm5,0x96 + + vpxorq zmm17,zmm17,zmm19 + vpsrldq zmm4,zmm17,8 + vpslldq zmm5,zmm17,8 + vpxorq zmm0,zmm15,zmm4 + vpxorq zmm3,zmm16,zmm5 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_655: + + or r13,r13 + je NEAR $L$_after_reduction_655 + vpxorq xmm14,xmm14,xmm13 +$L$_after_reduction_655: + jmp NEAR $L$_small_initial_blocks_encrypted_642 +$L$_small_initial_num_blocks_is_14_642: + vmovdqa64 zmm29,ZMMWORD[SHUF_MASK] + vshufi64x2 zmm2,zmm2,zmm2,0 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpaddd zmm3,zmm2,ZMMWORD[ddq_add_5678] + vpaddd zmm4,zmm0,ZMMWORD[ddq_add_8888] + vpaddd zmm5,zmm3,ZMMWORD[ddq_add_8888] + lea r10,[byte64_len_to_mask_table] + mov r15,r13 + sub r15,192 + kmovq k1,[r15*8+r10] + vextracti32x4 xmm2,zmm5,1 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb ymm5,ymm5,ymm29 + vmovdqu8 zmm6,ZMMWORD[r11*1+r9] + vmovdqu8 zmm7,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm10,ZMMWORD[128+r11*1+r9] + vmovdqu8 ymm11{k1}{z},[192+r11*1+r9] + vbroadcastf64x2 zmm15,ZMMWORD[rcx] + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm15 + vpxorq zmm4,zmm4,zmm15 + vpxorq ymm5,ymm5,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[16+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc ymm5,ymm5,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[32+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc ymm5,ymm5,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[48+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc ymm5,ymm5,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[64+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc ymm5,ymm5,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[80+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc ymm5,ymm5,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[96+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc ymm5,ymm5,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[112+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc ymm5,ymm5,ymm15 + vbroadcastf64x2 
zmm15,ZMMWORD[128+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc ymm5,ymm5,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc ymm5,ymm5,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[160+rcx] + vaesenclast zmm0,zmm0,zmm15 + vaesenclast zmm3,zmm3,zmm15 + vaesenclast zmm4,zmm4,zmm15 + vaesenclast ymm5,ymm5,ymm15 + vpxorq zmm0,zmm0,zmm6 + vpxorq zmm3,zmm3,zmm7 + vpxorq zmm4,zmm4,zmm10 + vpxorq ymm5,ymm5,ymm11 + vextracti32x4 xmm12,zmm5,1 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm4 + vmovdqu8 YMMWORD[192+r11*1+r10]{k1},ymm5 + vmovdqu8 zmm5{k1}{z},zmm5 + vpshufb zmm6,zmm6,zmm29 + vpshufb zmm7,zmm7,zmm29 + vpshufb zmm10,zmm10,zmm29 + vpshufb ymm11,ymm11,ymm29 + vextracti32x4 xmm13,zmm11,1 + sub r13,16 * (14 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_656 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[128+rdx] + vpclmulqdq zmm0,zmm6,zmm20,0x11 + vpclmulqdq zmm3,zmm6,zmm20,0x00 + vpclmulqdq zmm4,zmm6,zmm20,0x01 + vpclmulqdq zmm5,zmm6,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[192+rdx] + vpclmulqdq zmm15,zmm7,zmm20,0x11 + vpclmulqdq zmm16,zmm7,zmm20,0x00 + vpclmulqdq zmm17,zmm7,zmm20,0x01 + vpclmulqdq zmm19,zmm7,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[256+rdx] + vpclmulqdq zmm6,zmm10,zmm20,0x11 + vpclmulqdq zmm7,zmm10,zmm20,0x00 + vpternlogq zmm15,zmm6,zmm0,0x96 + vpternlogq zmm16,zmm7,zmm3,0x96 + vpclmulqdq zmm6,zmm10,zmm20,0x01 + vpclmulqdq zmm7,zmm10,zmm20,0x10 + vpternlogq zmm17,zmm6,zmm4,0x96 + vpternlogq zmm19,zmm7,zmm5,0x96 + vmovdqu64 ymm20,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm11,ymm20,0x01 + vpclmulqdq ymm5,ymm11,ymm20,0x10 + vpclmulqdq ymm0,ymm11,ymm20,0x11 + vpclmulqdq ymm3,ymm11,ymm20,0x00 + + vpxorq zmm4,zmm4,zmm17 + vpxorq zmm5,zmm5,zmm19 + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm16 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_656 +$L$_small_initial_partial_block_656: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm12 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[144+rdx] + vpclmulqdq zmm0,zmm6,zmm20,0x11 + vpclmulqdq zmm3,zmm6,zmm20,0x00 + vpclmulqdq zmm4,zmm6,zmm20,0x01 + vpclmulqdq zmm5,zmm6,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[208+rdx] + vpclmulqdq zmm15,zmm7,zmm20,0x11 + vpclmulqdq zmm16,zmm7,zmm20,0x00 + vpclmulqdq zmm17,zmm7,zmm20,0x01 + vpclmulqdq zmm19,zmm7,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[272+rdx] + vpclmulqdq zmm6,zmm10,zmm20,0x11 + vpclmulqdq zmm7,zmm10,zmm20,0x00 + vpternlogq zmm15,zmm6,zmm0,0x96 + vpternlogq zmm16,zmm7,zmm3,0x96 + vpclmulqdq zmm6,zmm10,zmm20,0x01 + vpclmulqdq zmm7,zmm10,zmm20,0x10 + vpternlogq zmm17,zmm6,zmm4,0x96 + vpternlogq zmm19,zmm7,zmm5,0x96 + vmovdqu64 xmm20,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm11,xmm20,0x01 + vpclmulqdq xmm5,xmm11,xmm20,0x10 + 
vpclmulqdq xmm0,xmm11,xmm20,0x11 + vpclmulqdq xmm3,xmm11,xmm20,0x00 + + vpxorq zmm4,zmm4,zmm17 + vpxorq zmm5,zmm5,zmm19 + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm16 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_656: + + or r13,r13 + je NEAR $L$_after_reduction_656 + vpxorq xmm14,xmm14,xmm13 +$L$_after_reduction_656: + jmp NEAR $L$_small_initial_blocks_encrypted_642 +$L$_small_initial_num_blocks_is_15_642: + vmovdqa64 zmm29,ZMMWORD[SHUF_MASK] + vshufi64x2 zmm2,zmm2,zmm2,0 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpaddd zmm3,zmm2,ZMMWORD[ddq_add_5678] + vpaddd zmm4,zmm0,ZMMWORD[ddq_add_8888] + vpaddd zmm5,zmm3,ZMMWORD[ddq_add_8888] + lea r10,[byte64_len_to_mask_table] + mov r15,r13 + sub r15,192 + kmovq k1,[r15*8+r10] + vextracti32x4 xmm2,zmm5,2 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb zmm5,zmm5,zmm29 + vmovdqu8 zmm6,ZMMWORD[r11*1+r9] + vmovdqu8 zmm7,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm10,ZMMWORD[128+r11*1+r9] + vmovdqu8 zmm11{k1}{z},[192+r11*1+r9] + vbroadcastf64x2 zmm15,ZMMWORD[rcx] + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm15 + vpxorq zmm4,zmm4,zmm15 + vpxorq zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[16+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[32+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[48+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[64+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[80+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[96+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[112+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[128+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[160+rcx] + vaesenclast zmm0,zmm0,zmm15 + vaesenclast zmm3,zmm3,zmm15 + vaesenclast zmm4,zmm4,zmm15 + vaesenclast zmm5,zmm5,zmm15 + vpxorq zmm0,zmm0,zmm6 + vpxorq zmm3,zmm3,zmm7 + vpxorq zmm4,zmm4,zmm10 + vpxorq zmm5,zmm5,zmm11 + vextracti32x4 xmm12,zmm5,2 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm4 + vmovdqu8 ZMMWORD[192+r11*1+r10]{k1},zmm5 + vmovdqu8 zmm5{k1}{z},zmm5 + vpshufb 
zmm6,zmm6,zmm29 + vpshufb zmm7,zmm7,zmm29 + vpshufb zmm10,zmm10,zmm29 + vpshufb zmm11,zmm11,zmm29 + vextracti32x4 xmm13,zmm11,2 + sub r13,16 * (15 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_657 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[112+rdx] + vpclmulqdq zmm0,zmm6,zmm20,0x11 + vpclmulqdq zmm3,zmm6,zmm20,0x00 + vpclmulqdq zmm4,zmm6,zmm20,0x01 + vpclmulqdq zmm5,zmm6,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[176+rdx] + vpclmulqdq zmm15,zmm7,zmm20,0x11 + vpclmulqdq zmm16,zmm7,zmm20,0x00 + vpclmulqdq zmm17,zmm7,zmm20,0x01 + vpclmulqdq zmm19,zmm7,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[240+rdx] + vpclmulqdq zmm6,zmm10,zmm20,0x11 + vpclmulqdq zmm7,zmm10,zmm20,0x00 + vpternlogq zmm15,zmm6,zmm0,0x96 + vpternlogq zmm16,zmm7,zmm3,0x96 + vpclmulqdq zmm6,zmm10,zmm20,0x01 + vpclmulqdq zmm7,zmm10,zmm20,0x10 + vpternlogq zmm17,zmm6,zmm4,0x96 + vpternlogq zmm19,zmm7,zmm5,0x96 + vmovdqu64 ymm20,YMMWORD[304+rdx] + vinserti64x2 zmm20,zmm20,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm11,zmm20,0x01 + vpclmulqdq zmm5,zmm11,zmm20,0x10 + vpclmulqdq zmm0,zmm11,zmm20,0x11 + vpclmulqdq zmm3,zmm11,zmm20,0x00 + + vpxorq zmm4,zmm4,zmm17 + vpxorq zmm5,zmm5,zmm19 + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm16 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_657 +$L$_small_initial_partial_block_657: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm12 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[128+rdx] + vpclmulqdq zmm0,zmm6,zmm20,0x11 + vpclmulqdq zmm3,zmm6,zmm20,0x00 + vpclmulqdq zmm4,zmm6,zmm20,0x01 + vpclmulqdq zmm5,zmm6,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[192+rdx] + vpclmulqdq zmm15,zmm7,zmm20,0x11 + vpclmulqdq zmm16,zmm7,zmm20,0x00 + vpclmulqdq zmm17,zmm7,zmm20,0x01 + vpclmulqdq zmm19,zmm7,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[256+rdx] + vpclmulqdq zmm6,zmm10,zmm20,0x11 + vpclmulqdq zmm7,zmm10,zmm20,0x00 + vpternlogq zmm15,zmm6,zmm0,0x96 + vpternlogq zmm16,zmm7,zmm3,0x96 + vpclmulqdq zmm6,zmm10,zmm20,0x01 + vpclmulqdq zmm7,zmm10,zmm20,0x10 + vpternlogq zmm17,zmm6,zmm4,0x96 + vpternlogq zmm19,zmm7,zmm5,0x96 + vmovdqu64 ymm20,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm11,ymm20,0x01 + vpclmulqdq ymm5,ymm11,ymm20,0x10 + vpclmulqdq ymm0,ymm11,ymm20,0x11 + vpclmulqdq ymm3,ymm11,ymm20,0x00 + + vpxorq zmm4,zmm4,zmm17 + vpxorq zmm5,zmm5,zmm19 + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm16 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq 
xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_657: + + or r13,r13 + je NEAR $L$_after_reduction_657 + vpxorq xmm14,xmm14,xmm13 +$L$_after_reduction_657: + jmp NEAR $L$_small_initial_blocks_encrypted_642 +$L$_small_initial_num_blocks_is_16_642: + vmovdqa64 zmm29,ZMMWORD[SHUF_MASK] + vshufi64x2 zmm2,zmm2,zmm2,0 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpaddd zmm3,zmm2,ZMMWORD[ddq_add_5678] + vpaddd zmm4,zmm0,ZMMWORD[ddq_add_8888] + vpaddd zmm5,zmm3,ZMMWORD[ddq_add_8888] + lea r10,[byte64_len_to_mask_table] + mov r15,r13 + sub r15,192 + kmovq k1,[r15*8+r10] + vextracti32x4 xmm2,zmm5,3 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb zmm5,zmm5,zmm29 + vmovdqu8 zmm6,ZMMWORD[r11*1+r9] + vmovdqu8 zmm7,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm10,ZMMWORD[128+r11*1+r9] + vmovdqu8 zmm11{k1}{z},[192+r11*1+r9] + vbroadcastf64x2 zmm15,ZMMWORD[rcx] + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm15 + vpxorq zmm4,zmm4,zmm15 + vpxorq zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[16+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[32+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[48+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[64+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[80+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[96+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[112+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[128+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[160+rcx] + vaesenclast zmm0,zmm0,zmm15 + vaesenclast zmm3,zmm3,zmm15 + vaesenclast zmm4,zmm4,zmm15 + vaesenclast zmm5,zmm5,zmm15 + vpxorq zmm0,zmm0,zmm6 + vpxorq zmm3,zmm3,zmm7 + vpxorq zmm4,zmm4,zmm10 + vpxorq zmm5,zmm5,zmm11 + vextracti32x4 xmm12,zmm5,3 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm4 + vmovdqu8 ZMMWORD[192+r11*1+r10]{k1},zmm5 + vmovdqu8 zmm5{k1}{z},zmm5 + vpshufb zmm6,zmm6,zmm29 + vpshufb zmm7,zmm7,zmm29 + vpshufb zmm10,zmm10,zmm29 + vpshufb zmm11,zmm11,zmm29 + vextracti32x4 xmm13,zmm11,3 + sub r13,16 * (16 - 1) +$L$_small_initial_partial_block_658: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm12 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[112+rdx] + vpclmulqdq zmm0,zmm6,zmm20,0x11 + vpclmulqdq zmm3,zmm6,zmm20,0x00 + vpclmulqdq zmm4,zmm6,zmm20,0x01 + vpclmulqdq zmm5,zmm6,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[176+rdx] + vpclmulqdq zmm15,zmm7,zmm20,0x11 + vpclmulqdq zmm16,zmm7,zmm20,0x00 + vpclmulqdq zmm17,zmm7,zmm20,0x01 + vpclmulqdq zmm19,zmm7,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[240+rdx] + vpclmulqdq zmm6,zmm10,zmm20,0x11 + vpclmulqdq 
zmm7,zmm10,zmm20,0x00 + vpternlogq zmm15,zmm6,zmm0,0x96 + vpternlogq zmm16,zmm7,zmm3,0x96 + vpclmulqdq zmm6,zmm10,zmm20,0x01 + vpclmulqdq zmm7,zmm10,zmm20,0x10 + vpternlogq zmm17,zmm6,zmm4,0x96 + vpternlogq zmm19,zmm7,zmm5,0x96 + vmovdqu64 ymm20,YMMWORD[304+rdx] + vinserti64x2 zmm20,zmm20,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm11,zmm20,0x01 + vpclmulqdq zmm5,zmm11,zmm20,0x10 + vpclmulqdq zmm0,zmm11,zmm20,0x11 + vpclmulqdq zmm3,zmm11,zmm20,0x00 + + vpxorq zmm4,zmm4,zmm17 + vpxorq zmm5,zmm5,zmm19 + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm16 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_658: + vpxorq xmm14,xmm14,xmm13 +$L$_after_reduction_658: +$L$_small_initial_blocks_encrypted_642: +$L$_ghash_done_497: + vmovdqu64 XMMWORD[rdx],xmm2 + vmovdqu64 XMMWORD[64+rdx],xmm14 +$L$_enc_dec_done_497: + jmp NEAR $L$exit_gcm_decrypt +ALIGN 32 +$L$aes_gcm_decrypt_192_avx512: + cmp QWORD[112+rbp],0 + je NEAR $L$_enc_dec_done_659 + xor r14,r14 + vmovdqu64 xmm14,XMMWORD[64+rdx] + + mov r11,QWORD[r8] + or r11,r11 + je NEAR $L$_partial_block_done_660 + mov r10d,16 + lea r12,[byte_len_to_mask_table] + cmp QWORD[112+rbp],r10 + cmovc r10,QWORD[112+rbp] + add r12,r10 + add r12,r10 + kmovw k1,[r12] + vmovdqu8 xmm0{k1}{z},[r9] + + vmovdqu64 xmm3,XMMWORD[16+rdx] + vmovdqu64 xmm4,XMMWORD[336+rdx] + + + + lea r12,[SHIFT_MASK] + add r12,r11 + vmovdqu64 xmm5,XMMWORD[r12] + vpshufb xmm3,xmm3,xmm5 + + vmovdqa64 xmm6,xmm0 + vpxorq xmm3,xmm3,xmm0 + + + mov r13,QWORD[112+rbp] + add r13,r11 + sub r13,16 + jge NEAR $L$_no_extra_mask_660 + sub r12,r13 +$L$_no_extra_mask_660: + + + + vmovdqu64 xmm0,XMMWORD[16+r12] + vpand xmm3,xmm3,xmm0 + vpand xmm6,xmm6,xmm0 + vpshufb xmm6,xmm6,XMMWORD[SHUF_MASK] + vpshufb xmm6,xmm6,xmm5 + vpxorq xmm14,xmm14,xmm6 + cmp r13,0 + jl NEAR $L$_partial_incomplete_660 + + vpclmulqdq xmm7,xmm14,xmm4,0x11 + vpclmulqdq xmm10,xmm14,xmm4,0x00 + vpclmulqdq xmm11,xmm14,xmm4,0x01 + vpclmulqdq xmm14,xmm14,xmm4,0x10 + vpxorq xmm14,xmm14,xmm11 + + vpsrldq xmm11,xmm14,8 + vpslldq xmm14,xmm14,8 + vpxorq xmm7,xmm7,xmm11 + vpxorq xmm14,xmm14,xmm10 + + + + vmovdqu64 xmm11,XMMWORD[POLY2] + + vpclmulqdq xmm10,xmm11,xmm14,0x01 + vpslldq xmm10,xmm10,8 + vpxorq xmm14,xmm14,xmm10 + + + + vpclmulqdq xmm10,xmm11,xmm14,0x00 + vpsrldq xmm10,xmm10,4 + vpclmulqdq xmm14,xmm11,xmm14,0x10 + vpslldq xmm14,xmm14,4 + + vpternlogq xmm14,xmm7,xmm10,0x96 + + mov QWORD[r8],0 + + mov r12,r11 + mov r11,16 + sub r11,r12 + jmp NEAR $L$_enc_dec_done_660 + +$L$_partial_incomplete_660: + mov r12,QWORD[112+rbp] + add QWORD[r8],r12 + mov r11,QWORD[112+rbp] + +$L$_enc_dec_done_660: + + + lea r12,[byte_len_to_mask_table] + kmovw k1,[r11*2+r12] + vmovdqu64 XMMWORD[64+rdx],xmm14 + mov r12,QWORD[120+rbp] + vmovdqu8 XMMWORD[r12]{k1},xmm3 +$L$_partial_block_done_660: + vmovdqu64 xmm2,XMMWORD[rdx] + mov r13,QWORD[112+rbp] + sub r13,r11 + je NEAR $L$_enc_dec_done_659 + cmp r13,256 + jbe NEAR $L$_message_below_equal_16_blocks_659 + + vmovdqa64 zmm29,ZMMWORD[SHUF_MASK] 
+ vmovdqa64 zmm27,ZMMWORD[ddq_addbe_4444] + vmovdqa64 zmm28,ZMMWORD[ddq_addbe_1234] + + + + + + + vmovd r15d,xmm2 + and r15d,255 + + vshufi64x2 zmm2,zmm2,zmm2,0 + vpshufb zmm2,zmm2,zmm29 + + + + cmp r15b,240 + jae NEAR $L$_next_16_overflow_661 + vpaddd zmm7,zmm2,zmm28 + vpaddd zmm10,zmm7,zmm27 + vpaddd zmm11,zmm10,zmm27 + vpaddd zmm12,zmm11,zmm27 + jmp NEAR $L$_next_16_ok_661 +$L$_next_16_overflow_661: + vpshufb zmm2,zmm2,zmm29 + vmovdqa64 zmm12,ZMMWORD[ddq_add_4444] + vpaddd zmm7,zmm2,ZMMWORD[ddq_add_1234] + vpaddd zmm10,zmm7,zmm12 + vpaddd zmm11,zmm10,zmm12 + vpaddd zmm12,zmm11,zmm12 + vpshufb zmm7,zmm7,zmm29 + vpshufb zmm10,zmm10,zmm29 + vpshufb zmm11,zmm11,zmm29 + vpshufb zmm12,zmm12,zmm29 +$L$_next_16_ok_661: + vshufi64x2 zmm2,zmm12,zmm12,255 + add r15b,16 + + vmovdqu8 zmm0,ZMMWORD[r11*1+r9] + vmovdqu8 zmm3,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm4,ZMMWORD[128+r11*1+r9] + vmovdqu8 zmm5,ZMMWORD[192+r11*1+r9] + + + vbroadcastf64x2 zmm6,ZMMWORD[rcx] + vpxorq zmm7,zmm7,zmm6 + vpxorq zmm10,zmm10,zmm6 + vpxorq zmm11,zmm11,zmm6 + vpxorq zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[16+rcx] + vaesenc zmm7,zmm7,zmm6 + vaesenc zmm10,zmm10,zmm6 + vaesenc zmm11,zmm11,zmm6 + vaesenc zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[32+rcx] + vaesenc zmm7,zmm7,zmm6 + vaesenc zmm10,zmm10,zmm6 + vaesenc zmm11,zmm11,zmm6 + vaesenc zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[48+rcx] + vaesenc zmm7,zmm7,zmm6 + vaesenc zmm10,zmm10,zmm6 + vaesenc zmm11,zmm11,zmm6 + vaesenc zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[64+rcx] + vaesenc zmm7,zmm7,zmm6 + vaesenc zmm10,zmm10,zmm6 + vaesenc zmm11,zmm11,zmm6 + vaesenc zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[80+rcx] + vaesenc zmm7,zmm7,zmm6 + vaesenc zmm10,zmm10,zmm6 + vaesenc zmm11,zmm11,zmm6 + vaesenc zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[96+rcx] + vaesenc zmm7,zmm7,zmm6 + vaesenc zmm10,zmm10,zmm6 + vaesenc zmm11,zmm11,zmm6 + vaesenc zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[112+rcx] + vaesenc zmm7,zmm7,zmm6 + vaesenc zmm10,zmm10,zmm6 + vaesenc zmm11,zmm11,zmm6 + vaesenc zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[128+rcx] + vaesenc zmm7,zmm7,zmm6 + vaesenc zmm10,zmm10,zmm6 + vaesenc zmm11,zmm11,zmm6 + vaesenc zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[144+rcx] + vaesenc zmm7,zmm7,zmm6 + vaesenc zmm10,zmm10,zmm6 + vaesenc zmm11,zmm11,zmm6 + vaesenc zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[160+rcx] + vaesenc zmm7,zmm7,zmm6 + vaesenc zmm10,zmm10,zmm6 + vaesenc zmm11,zmm11,zmm6 + vaesenc zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[176+rcx] + vaesenc zmm7,zmm7,zmm6 + vaesenc zmm10,zmm10,zmm6 + vaesenc zmm11,zmm11,zmm6 + vaesenc zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[192+rcx] + vaesenclast zmm7,zmm7,zmm6 + vaesenclast zmm10,zmm10,zmm6 + vaesenclast zmm11,zmm11,zmm6 + vaesenclast zmm12,zmm12,zmm6 + + + vpxorq zmm7,zmm7,zmm0 + vpxorq zmm10,zmm10,zmm3 + vpxorq zmm11,zmm11,zmm4 + vpxorq zmm12,zmm12,zmm5 + + + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm7 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm10 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm11 + vmovdqu8 ZMMWORD[192+r11*1+r10],zmm12 + + vpshufb zmm7,zmm0,zmm29 + vpshufb zmm10,zmm3,zmm29 + vpshufb zmm11,zmm4,zmm29 + vpshufb zmm12,zmm5,zmm29 + vmovdqa64 ZMMWORD[768+rsp],zmm7 + vmovdqa64 ZMMWORD[832+rsp],zmm10 + vmovdqa64 ZMMWORD[896+rsp],zmm11 + vmovdqa64 ZMMWORD[960+rsp],zmm12 + test r14,r14 + jnz NEAR $L$_skip_hkeys_precomputation_662 + + vmovdqu64 zmm0,ZMMWORD[288+rdx] + vmovdqu64 ZMMWORD[704+rsp],zmm0 + + vmovdqu64 zmm3,ZMMWORD[224+rdx] + vmovdqu64 
ZMMWORD[640+rsp],zmm3 + + + vshufi64x2 zmm3,zmm3,zmm3,0x00 + + vmovdqu64 zmm4,ZMMWORD[160+rdx] + vmovdqu64 ZMMWORD[576+rsp],zmm4 + + vmovdqu64 zmm5,ZMMWORD[96+rdx] + vmovdqu64 ZMMWORD[512+rsp],zmm5 +$L$_skip_hkeys_precomputation_662: + cmp r13,512 + jb NEAR $L$_message_below_32_blocks_659 + + + + cmp r15b,240 + jae NEAR $L$_next_16_overflow_663 + vpaddd zmm7,zmm2,zmm28 + vpaddd zmm10,zmm7,zmm27 + vpaddd zmm11,zmm10,zmm27 + vpaddd zmm12,zmm11,zmm27 + jmp NEAR $L$_next_16_ok_663 +$L$_next_16_overflow_663: + vpshufb zmm2,zmm2,zmm29 + vmovdqa64 zmm12,ZMMWORD[ddq_add_4444] + vpaddd zmm7,zmm2,ZMMWORD[ddq_add_1234] + vpaddd zmm10,zmm7,zmm12 + vpaddd zmm11,zmm10,zmm12 + vpaddd zmm12,zmm11,zmm12 + vpshufb zmm7,zmm7,zmm29 + vpshufb zmm10,zmm10,zmm29 + vpshufb zmm11,zmm11,zmm29 + vpshufb zmm12,zmm12,zmm29 +$L$_next_16_ok_663: + vshufi64x2 zmm2,zmm12,zmm12,255 + add r15b,16 + + vmovdqu8 zmm0,ZMMWORD[256+r11*1+r9] + vmovdqu8 zmm3,ZMMWORD[320+r11*1+r9] + vmovdqu8 zmm4,ZMMWORD[384+r11*1+r9] + vmovdqu8 zmm5,ZMMWORD[448+r11*1+r9] + + + vbroadcastf64x2 zmm6,ZMMWORD[rcx] + vpxorq zmm7,zmm7,zmm6 + vpxorq zmm10,zmm10,zmm6 + vpxorq zmm11,zmm11,zmm6 + vpxorq zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[16+rcx] + vaesenc zmm7,zmm7,zmm6 + vaesenc zmm10,zmm10,zmm6 + vaesenc zmm11,zmm11,zmm6 + vaesenc zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[32+rcx] + vaesenc zmm7,zmm7,zmm6 + vaesenc zmm10,zmm10,zmm6 + vaesenc zmm11,zmm11,zmm6 + vaesenc zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[48+rcx] + vaesenc zmm7,zmm7,zmm6 + vaesenc zmm10,zmm10,zmm6 + vaesenc zmm11,zmm11,zmm6 + vaesenc zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[64+rcx] + vaesenc zmm7,zmm7,zmm6 + vaesenc zmm10,zmm10,zmm6 + vaesenc zmm11,zmm11,zmm6 + vaesenc zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[80+rcx] + vaesenc zmm7,zmm7,zmm6 + vaesenc zmm10,zmm10,zmm6 + vaesenc zmm11,zmm11,zmm6 + vaesenc zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[96+rcx] + vaesenc zmm7,zmm7,zmm6 + vaesenc zmm10,zmm10,zmm6 + vaesenc zmm11,zmm11,zmm6 + vaesenc zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[112+rcx] + vaesenc zmm7,zmm7,zmm6 + vaesenc zmm10,zmm10,zmm6 + vaesenc zmm11,zmm11,zmm6 + vaesenc zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[128+rcx] + vaesenc zmm7,zmm7,zmm6 + vaesenc zmm10,zmm10,zmm6 + vaesenc zmm11,zmm11,zmm6 + vaesenc zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[144+rcx] + vaesenc zmm7,zmm7,zmm6 + vaesenc zmm10,zmm10,zmm6 + vaesenc zmm11,zmm11,zmm6 + vaesenc zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[160+rcx] + vaesenc zmm7,zmm7,zmm6 + vaesenc zmm10,zmm10,zmm6 + vaesenc zmm11,zmm11,zmm6 + vaesenc zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[176+rcx] + vaesenc zmm7,zmm7,zmm6 + vaesenc zmm10,zmm10,zmm6 + vaesenc zmm11,zmm11,zmm6 + vaesenc zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[192+rcx] + vaesenclast zmm7,zmm7,zmm6 + vaesenclast zmm10,zmm10,zmm6 + vaesenclast zmm11,zmm11,zmm6 + vaesenclast zmm12,zmm12,zmm6 + + + vpxorq zmm7,zmm7,zmm0 + vpxorq zmm10,zmm10,zmm3 + vpxorq zmm11,zmm11,zmm4 + vpxorq zmm12,zmm12,zmm5 + + + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[256+r11*1+r10],zmm7 + vmovdqu8 ZMMWORD[320+r11*1+r10],zmm10 + vmovdqu8 ZMMWORD[384+r11*1+r10],zmm11 + vmovdqu8 ZMMWORD[448+r11*1+r10],zmm12 + + vpshufb zmm7,zmm0,zmm29 + vpshufb zmm10,zmm3,zmm29 + vpshufb zmm11,zmm4,zmm29 + vpshufb zmm12,zmm5,zmm29 + vmovdqa64 ZMMWORD[1024+rsp],zmm7 + vmovdqa64 ZMMWORD[1088+rsp],zmm10 + vmovdqa64 ZMMWORD[1152+rsp],zmm11 + vmovdqa64 ZMMWORD[1216+rsp],zmm12 + test r14,r14 + jnz NEAR $L$_skip_hkeys_precomputation_664 + vmovdqu64 
zmm3,ZMMWORD[640+rsp] + + + vshufi64x2 zmm3,zmm3,zmm3,0x00 + + vmovdqu64 zmm4,ZMMWORD[576+rsp] + vmovdqu64 zmm5,ZMMWORD[512+rsp] + + vpclmulqdq zmm6,zmm4,zmm3,0x11 + vpclmulqdq zmm7,zmm4,zmm3,0x00 + vpclmulqdq zmm10,zmm4,zmm3,0x01 + vpclmulqdq zmm4,zmm4,zmm3,0x10 + vpxorq zmm4,zmm4,zmm10 + + vpsrldq zmm10,zmm4,8 + vpslldq zmm4,zmm4,8 + vpxorq zmm6,zmm6,zmm10 + vpxorq zmm4,zmm4,zmm7 + + + + vmovdqu64 zmm10,ZMMWORD[POLY2] + + vpclmulqdq zmm7,zmm10,zmm4,0x01 + vpslldq zmm7,zmm7,8 + vpxorq zmm4,zmm4,zmm7 + + + + vpclmulqdq zmm7,zmm10,zmm4,0x00 + vpsrldq zmm7,zmm7,4 + vpclmulqdq zmm4,zmm10,zmm4,0x10 + vpslldq zmm4,zmm4,4 + + vpternlogq zmm4,zmm6,zmm7,0x96 + + vmovdqu64 ZMMWORD[448+rsp],zmm4 + + vpclmulqdq zmm6,zmm5,zmm3,0x11 + vpclmulqdq zmm7,zmm5,zmm3,0x00 + vpclmulqdq zmm10,zmm5,zmm3,0x01 + vpclmulqdq zmm5,zmm5,zmm3,0x10 + vpxorq zmm5,zmm5,zmm10 + + vpsrldq zmm10,zmm5,8 + vpslldq zmm5,zmm5,8 + vpxorq zmm6,zmm6,zmm10 + vpxorq zmm5,zmm5,zmm7 + + + + vmovdqu64 zmm10,ZMMWORD[POLY2] + + vpclmulqdq zmm7,zmm10,zmm5,0x01 + vpslldq zmm7,zmm7,8 + vpxorq zmm5,zmm5,zmm7 + + + + vpclmulqdq zmm7,zmm10,zmm5,0x00 + vpsrldq zmm7,zmm7,4 + vpclmulqdq zmm5,zmm10,zmm5,0x10 + vpslldq zmm5,zmm5,4 + + vpternlogq zmm5,zmm6,zmm7,0x96 + + vmovdqu64 ZMMWORD[384+rsp],zmm5 + + vpclmulqdq zmm6,zmm4,zmm3,0x11 + vpclmulqdq zmm7,zmm4,zmm3,0x00 + vpclmulqdq zmm10,zmm4,zmm3,0x01 + vpclmulqdq zmm4,zmm4,zmm3,0x10 + vpxorq zmm4,zmm4,zmm10 + + vpsrldq zmm10,zmm4,8 + vpslldq zmm4,zmm4,8 + vpxorq zmm6,zmm6,zmm10 + vpxorq zmm4,zmm4,zmm7 + + + + vmovdqu64 zmm10,ZMMWORD[POLY2] + + vpclmulqdq zmm7,zmm10,zmm4,0x01 + vpslldq zmm7,zmm7,8 + vpxorq zmm4,zmm4,zmm7 + + + + vpclmulqdq zmm7,zmm10,zmm4,0x00 + vpsrldq zmm7,zmm7,4 + vpclmulqdq zmm4,zmm10,zmm4,0x10 + vpslldq zmm4,zmm4,4 + + vpternlogq zmm4,zmm6,zmm7,0x96 + + vmovdqu64 ZMMWORD[320+rsp],zmm4 + + vpclmulqdq zmm6,zmm5,zmm3,0x11 + vpclmulqdq zmm7,zmm5,zmm3,0x00 + vpclmulqdq zmm10,zmm5,zmm3,0x01 + vpclmulqdq zmm5,zmm5,zmm3,0x10 + vpxorq zmm5,zmm5,zmm10 + + vpsrldq zmm10,zmm5,8 + vpslldq zmm5,zmm5,8 + vpxorq zmm6,zmm6,zmm10 + vpxorq zmm5,zmm5,zmm7 + + + + vmovdqu64 zmm10,ZMMWORD[POLY2] + + vpclmulqdq zmm7,zmm10,zmm5,0x01 + vpslldq zmm7,zmm7,8 + vpxorq zmm5,zmm5,zmm7 + + + + vpclmulqdq zmm7,zmm10,zmm5,0x00 + vpsrldq zmm7,zmm7,4 + vpclmulqdq zmm5,zmm10,zmm5,0x10 + vpslldq zmm5,zmm5,4 + + vpternlogq zmm5,zmm6,zmm7,0x96 + + vmovdqu64 ZMMWORD[256+rsp],zmm5 + + vpclmulqdq zmm6,zmm4,zmm3,0x11 + vpclmulqdq zmm7,zmm4,zmm3,0x00 + vpclmulqdq zmm10,zmm4,zmm3,0x01 + vpclmulqdq zmm4,zmm4,zmm3,0x10 + vpxorq zmm4,zmm4,zmm10 + + vpsrldq zmm10,zmm4,8 + vpslldq zmm4,zmm4,8 + vpxorq zmm6,zmm6,zmm10 + vpxorq zmm4,zmm4,zmm7 + + + + vmovdqu64 zmm10,ZMMWORD[POLY2] + + vpclmulqdq zmm7,zmm10,zmm4,0x01 + vpslldq zmm7,zmm7,8 + vpxorq zmm4,zmm4,zmm7 + + + + vpclmulqdq zmm7,zmm10,zmm4,0x00 + vpsrldq zmm7,zmm7,4 + vpclmulqdq zmm4,zmm10,zmm4,0x10 + vpslldq zmm4,zmm4,4 + + vpternlogq zmm4,zmm6,zmm7,0x96 + + vmovdqu64 ZMMWORD[192+rsp],zmm4 + + vpclmulqdq zmm6,zmm5,zmm3,0x11 + vpclmulqdq zmm7,zmm5,zmm3,0x00 + vpclmulqdq zmm10,zmm5,zmm3,0x01 + vpclmulqdq zmm5,zmm5,zmm3,0x10 + vpxorq zmm5,zmm5,zmm10 + + vpsrldq zmm10,zmm5,8 + vpslldq zmm5,zmm5,8 + vpxorq zmm6,zmm6,zmm10 + vpxorq zmm5,zmm5,zmm7 + + + + vmovdqu64 zmm10,ZMMWORD[POLY2] + + vpclmulqdq zmm7,zmm10,zmm5,0x01 + vpslldq zmm7,zmm7,8 + vpxorq zmm5,zmm5,zmm7 + + + + vpclmulqdq zmm7,zmm10,zmm5,0x00 + vpsrldq zmm7,zmm7,4 + vpclmulqdq zmm5,zmm10,zmm5,0x10 + vpslldq zmm5,zmm5,4 + + vpternlogq zmm5,zmm6,zmm7,0x96 + + vmovdqu64 ZMMWORD[128+rsp],zmm5 + + vpclmulqdq 
zmm6,zmm4,zmm3,0x11 + vpclmulqdq zmm7,zmm4,zmm3,0x00 + vpclmulqdq zmm10,zmm4,zmm3,0x01 + vpclmulqdq zmm4,zmm4,zmm3,0x10 + vpxorq zmm4,zmm4,zmm10 + + vpsrldq zmm10,zmm4,8 + vpslldq zmm4,zmm4,8 + vpxorq zmm6,zmm6,zmm10 + vpxorq zmm4,zmm4,zmm7 + + + + vmovdqu64 zmm10,ZMMWORD[POLY2] + + vpclmulqdq zmm7,zmm10,zmm4,0x01 + vpslldq zmm7,zmm7,8 + vpxorq zmm4,zmm4,zmm7 + + + + vpclmulqdq zmm7,zmm10,zmm4,0x00 + vpsrldq zmm7,zmm7,4 + vpclmulqdq zmm4,zmm10,zmm4,0x10 + vpslldq zmm4,zmm4,4 + + vpternlogq zmm4,zmm6,zmm7,0x96 + + vmovdqu64 ZMMWORD[64+rsp],zmm4 + + vpclmulqdq zmm6,zmm5,zmm3,0x11 + vpclmulqdq zmm7,zmm5,zmm3,0x00 + vpclmulqdq zmm10,zmm5,zmm3,0x01 + vpclmulqdq zmm5,zmm5,zmm3,0x10 + vpxorq zmm5,zmm5,zmm10 + + vpsrldq zmm10,zmm5,8 + vpslldq zmm5,zmm5,8 + vpxorq zmm6,zmm6,zmm10 + vpxorq zmm5,zmm5,zmm7 + + + + vmovdqu64 zmm10,ZMMWORD[POLY2] + + vpclmulqdq zmm7,zmm10,zmm5,0x01 + vpslldq zmm7,zmm7,8 + vpxorq zmm5,zmm5,zmm7 + + + + vpclmulqdq zmm7,zmm10,zmm5,0x00 + vpsrldq zmm7,zmm7,4 + vpclmulqdq zmm5,zmm10,zmm5,0x10 + vpslldq zmm5,zmm5,4 + + vpternlogq zmm5,zmm6,zmm7,0x96 + + vmovdqu64 ZMMWORD[rsp],zmm5 +$L$_skip_hkeys_precomputation_664: + mov r14,1 + add r11,512 + sub r13,512 + + cmp r13,768 + jb NEAR $L$_no_more_big_nblocks_659 +$L$_encrypt_big_nblocks_659: + cmp r15b,240 + jae NEAR $L$_16_blocks_overflow_665 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + vpaddd zmm5,zmm4,zmm27 + jmp NEAR $L$_16_blocks_ok_665 +$L$_16_blocks_overflow_665: + vpshufb zmm2,zmm2,zmm29 + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpaddd zmm5,zmm4,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb zmm5,zmm5,zmm29 +$L$_16_blocks_ok_665: + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rsp] + + + + + vshufi64x2 zmm2,zmm5,zmm5,255 + add r15b,16 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + + + + + + + + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + + vpclmulqdq zmm6,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + + + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + + + + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + + vpternlogq zmm6,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + + + + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + + + + vmovdqu8 
zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20,ZMMWORD[128+r11*1+r9] + vmovdqu8 zmm21,ZMMWORD[192+r11*1+r9] + + + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + + + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm26,zmm10,zmm15 + vpxorq zmm24,zmm6,zmm12 + vpxorq zmm25,zmm7,zmm13 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vaesenclast zmm5,zmm5,zmm30 + + + + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vpxorq zmm5,zmm5,zmm21 + + + + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm4 + vmovdqu8 ZMMWORD[192+r11*1+r10],zmm5 + vpshufb zmm0,zmm17,zmm29 + vpshufb zmm3,zmm19,zmm29 + vpshufb zmm4,zmm20,zmm29 + vpshufb zmm5,zmm21,zmm29 + vmovdqa64 ZMMWORD[1280+rsp],zmm0 + vmovdqa64 ZMMWORD[1344+rsp],zmm3 + vmovdqa64 ZMMWORD[1408+rsp],zmm4 + vmovdqa64 ZMMWORD[1472+rsp],zmm5 + cmp r15b,240 + jae NEAR $L$_16_blocks_overflow_666 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + vpaddd zmm5,zmm4,zmm27 + jmp NEAR $L$_16_blocks_ok_666 +$L$_16_blocks_overflow_666: + vpshufb zmm2,zmm2,zmm29 + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpaddd zmm5,zmm4,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb zmm5,zmm5,zmm29 +$L$_16_blocks_ok_666: + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1024+rsp] + vmovdqu64 zmm1,ZMMWORD[256+rsp] + + + + + vshufi64x2 zmm2,zmm5,zmm5,255 + add r15b,16 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[320+rsp] + vmovdqa64 zmm22,ZMMWORD[1088+rsp] + + + + + + + + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + + vpclmulqdq zmm6,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[384+rsp] + vmovdqa64 zmm8,ZMMWORD[1152+rsp] + + + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[448+rsp] + vmovdqa64 
zmm22,ZMMWORD[1216+rsp] + + + + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + + vpternlogq zmm6,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + + + + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + + + + vmovdqu8 zmm17,ZMMWORD[256+r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[320+r11*1+r9] + vmovdqu8 zmm20,ZMMWORD[384+r11*1+r9] + vmovdqu8 zmm21,ZMMWORD[448+r11*1+r9] + + + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + + + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm26,zmm10,zmm15,0x96 + vpternlogq zmm24,zmm6,zmm12,0x96 + vpternlogq zmm25,zmm7,zmm13,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vaesenclast zmm5,zmm5,zmm30 + + + + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vpxorq zmm5,zmm5,zmm21 + + + + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[256+r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[320+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[384+r11*1+r10],zmm4 + vmovdqu8 ZMMWORD[448+r11*1+r10],zmm5 + vpshufb zmm0,zmm17,zmm29 + vpshufb zmm3,zmm19,zmm29 + vpshufb zmm4,zmm20,zmm29 + vpshufb zmm5,zmm21,zmm29 + vmovdqa64 ZMMWORD[768+rsp],zmm0 + vmovdqa64 ZMMWORD[832+rsp],zmm3 + vmovdqa64 ZMMWORD[896+rsp],zmm4 + vmovdqa64 ZMMWORD[960+rsp],zmm5 + cmp r15b,240 + jae NEAR $L$_16_blocks_overflow_667 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + vpaddd zmm5,zmm4,zmm27 + jmp NEAR $L$_16_blocks_ok_667 +$L$_16_blocks_overflow_667: + vpshufb zmm2,zmm2,zmm29 + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpaddd zmm5,zmm4,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb zmm5,zmm5,zmm29 +$L$_16_blocks_ok_667: + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1280+rsp] + vmovdqu64 zmm1,ZMMWORD[512+rsp] + + + + + vshufi64x2 zmm2,zmm5,zmm5,255 + add r15b,16 + + + 
vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[576+rsp] + vmovdqa64 zmm22,ZMMWORD[1344+rsp] + + + + + + + + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + + vpclmulqdq zmm6,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[640+rsp] + vmovdqa64 zmm8,ZMMWORD[1408+rsp] + + + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[704+rsp] + vmovdqa64 zmm22,ZMMWORD[1472+rsp] + + + + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + + vpternlogq zmm6,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + + + + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + + + + vmovdqu8 zmm17,ZMMWORD[512+r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[576+r11*1+r9] + vmovdqu8 zmm20,ZMMWORD[640+r11*1+r9] + vmovdqu8 zmm21,ZMMWORD[704+r11*1+r9] + + + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + + + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + + + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm10,zmm26,zmm15,0x96 + + vpsrldq zmm15,zmm10,8 + vpslldq zmm10,zmm10,8 + + vmovdqa64 xmm16,XMMWORD[POLY2] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vpternlogq zmm6,zmm12,zmm15,0x96 + vpxorq zmm6,zmm6,zmm24 + vpternlogq zmm7,zmm13,zmm10,0x96 + vpxorq zmm7,zmm7,zmm25 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vextracti64x4 ymm12,zmm6,1 + vpxorq ymm6,ymm6,ymm12 + vextracti32x4 xmm12,ymm6,1 + vpxorq xmm6,xmm6,xmm12 + vextracti64x4 ymm13,zmm7,1 + vpxorq ymm7,ymm7,ymm13 + vextracti32x4 xmm13,ymm7,1 + vpxorq xmm7,xmm7,xmm13 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vpclmulqdq xmm13,xmm16,xmm7,0x01 + vpslldq xmm13,xmm13,8 + vpxorq xmm13,xmm7,xmm13 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vpclmulqdq xmm12,xmm16,xmm13,0x00 + vpsrldq xmm12,xmm12,4 + vpclmulqdq 
xmm15,xmm16,xmm13,0x10 + vpslldq xmm15,xmm15,4 + + vpternlogq xmm6,xmm15,xmm12,0x96 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vaesenclast zmm5,zmm5,zmm30 + + + + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vpxorq zmm5,zmm5,zmm21 + + + + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[512+r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[576+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[640+r11*1+r10],zmm4 + vmovdqu8 ZMMWORD[704+r11*1+r10],zmm5 + vpshufb zmm0,zmm17,zmm29 + vpshufb zmm3,zmm19,zmm29 + vpshufb zmm4,zmm20,zmm29 + vpshufb zmm5,zmm21,zmm29 + vmovdqa64 ZMMWORD[1024+rsp],zmm0 + vmovdqa64 ZMMWORD[1088+rsp],zmm3 + vmovdqa64 ZMMWORD[1152+rsp],zmm4 + vmovdqa64 ZMMWORD[1216+rsp],zmm5 + vmovdqa64 zmm14,zmm6 + + add r11,768 + sub r13,768 + cmp r13,768 + jae NEAR $L$_encrypt_big_nblocks_659 + +$L$_no_more_big_nblocks_659: + + cmp r13,512 + jae NEAR $L$_encrypt_32_blocks_659 + + cmp r13,256 + jae NEAR $L$_encrypt_16_blocks_659 +$L$_encrypt_0_blocks_ghash_32_659: + mov r10d,r13d + and r10d,~15 + mov ebx,256 + sub ebx,r10d + vmovdqa64 zmm13,ZMMWORD[768+rsp] + vpxorq zmm13,zmm13,zmm14 + vmovdqu64 zmm12,ZMMWORD[rbx*1+rsp] + vpclmulqdq zmm0,zmm13,zmm12,0x11 + vpclmulqdq zmm3,zmm13,zmm12,0x00 + vpclmulqdq zmm4,zmm13,zmm12,0x01 + vpclmulqdq zmm5,zmm13,zmm12,0x10 + vmovdqa64 zmm13,ZMMWORD[832+rsp] + vmovdqu64 zmm12,ZMMWORD[64+rbx*1+rsp] + vpclmulqdq zmm6,zmm13,zmm12,0x11 + vpclmulqdq zmm7,zmm13,zmm12,0x00 + vpclmulqdq zmm10,zmm13,zmm12,0x01 + vpclmulqdq zmm11,zmm13,zmm12,0x10 + vpxorq zmm26,zmm4,zmm10 + vpxorq zmm24,zmm0,zmm6 + vpxorq zmm25,zmm3,zmm7 + vpternlogq zmm26,zmm5,zmm11,0x96 + vmovdqa64 zmm13,ZMMWORD[896+rsp] + vmovdqu64 zmm12,ZMMWORD[128+rbx*1+rsp] + vpclmulqdq zmm0,zmm13,zmm12,0x11 + vpclmulqdq zmm3,zmm13,zmm12,0x00 + vpclmulqdq zmm4,zmm13,zmm12,0x01 + vpclmulqdq zmm5,zmm13,zmm12,0x10 + vmovdqa64 zmm13,ZMMWORD[960+rsp] + vmovdqu64 zmm12,ZMMWORD[192+rbx*1+rsp] + vpclmulqdq zmm6,zmm13,zmm12,0x11 + vpclmulqdq zmm7,zmm13,zmm12,0x00 + vpclmulqdq zmm10,zmm13,zmm12,0x01 + vpclmulqdq zmm11,zmm13,zmm12,0x10 + + vpternlogq zmm26,zmm4,zmm10,0x96 + vpternlogq zmm24,zmm0,zmm6,0x96 + vpternlogq zmm25,zmm3,zmm7,0x96 + vpternlogq zmm26,zmm5,zmm11,0x96 + add ebx,256 + mov r10d,r13d + add r10d,15 + shr r10d,4 + je NEAR $L$_last_num_blocks_is_0_668 + + cmp r10d,8 + je NEAR $L$_last_num_blocks_is_8_668 + jb NEAR $L$_last_num_blocks_is_7_1_668 + + + cmp r10d,12 + je NEAR $L$_last_num_blocks_is_12_668 + jb NEAR $L$_last_num_blocks_is_11_9_668 + + + cmp r10d,15 + je NEAR $L$_last_num_blocks_is_15_668 + ja NEAR $L$_last_num_blocks_is_16_668 + cmp r10d,14 + je NEAR $L$_last_num_blocks_is_14_668 + jmp NEAR $L$_last_num_blocks_is_13_668 + +$L$_last_num_blocks_is_11_9_668: + + cmp r10d,10 + je NEAR $L$_last_num_blocks_is_10_668 + ja NEAR $L$_last_num_blocks_is_11_668 + jmp NEAR $L$_last_num_blocks_is_9_668 + +$L$_last_num_blocks_is_7_1_668: + cmp r10d,4 + je NEAR $L$_last_num_blocks_is_4_668 + jb NEAR $L$_last_num_blocks_is_3_1_668 + + cmp r10d,6 + ja NEAR $L$_last_num_blocks_is_7_668 + je NEAR $L$_last_num_blocks_is_6_668 + jmp NEAR $L$_last_num_blocks_is_5_668 + +$L$_last_num_blocks_is_3_1_668: + + cmp r10d,2 + ja NEAR $L$_last_num_blocks_is_3_668 + je NEAR $L$_last_num_blocks_is_2_668 +$L$_last_num_blocks_is_1_668: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + kmovq k1,[rax*8+r10] + cmp r15d,255 + jae NEAR $L$_16_blocks_overflow_669 + vpaddd xmm0,xmm2,xmm28 + jmp NEAR $L$_16_blocks_ok_669 + +$L$_16_blocks_overflow_669: + vpshufb zmm2,zmm2,zmm29 + vpaddd 
zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpshufb xmm0,xmm0,xmm29 +$L$_16_blocks_ok_669: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1024+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm0,0 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1088+rsp] + vpxorq xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[1152+rsp] + vaesenc xmm0,xmm0,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1216+rsp] + vaesenc xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc xmm0,xmm0,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 xmm17{k1}{z},[r11*1+r9] + vaesenc xmm0,xmm0,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm24,zmm14,zmm12,0x96 + vpternlogq zmm25,zmm7,zmm13,0x96 + vpternlogq zmm26,zmm10,zmm15,0x96 + vaesenc xmm0,xmm0,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc xmm0,xmm0,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc xmm0,xmm0,xmm31 + vaesenclast xmm0,xmm0,xmm30 + vpxorq xmm0,xmm0,xmm17 + vextracti32x4 xmm11,zmm0,0 + mov r10,QWORD[120+rbp] + vmovdqu8 XMMWORD[r11*1+r10]{k1},xmm0 + vmovdqu8 zmm17{k1}{z},zmm17 + vpshufb xmm17,xmm17,xmm29 + vextracti32x4 xmm7,zmm17,0 + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_670 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm17,xmm1,0x01 + vpclmulqdq xmm5,xmm17,xmm1,0x10 + vpclmulqdq xmm0,xmm17,xmm1,0x11 + vpclmulqdq xmm3,xmm17,xmm1,0x00 + vpxorq zmm4,zmm4,zmm26 + vpxorq zmm0,zmm0,zmm24 + vpxorq zmm3,zmm3,zmm25 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_670 +$L$_small_initial_partial_block_670: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + + + vpsrldq zmm0,zmm26,8 
+ vpslldq zmm3,zmm26,8 + vpxorq zmm24,zmm24,zmm0 + vpxorq zmm25,zmm25,zmm3 + vextracti64x4 ymm0,zmm24,1 + vpxorq ymm24,ymm24,ymm0 + vextracti32x4 xmm0,ymm24,1 + vpxorq xmm24,xmm24,xmm0 + vextracti64x4 ymm3,zmm25,1 + vpxorq ymm25,ymm25,ymm3 + vextracti32x4 xmm3,ymm25,1 + vpxorq xmm25,xmm25,xmm3 + vmovdqa64 xmm0,XMMWORD[POLY2] + + + vpclmulqdq xmm3,xmm0,xmm25,0x01 + vpslldq xmm3,xmm3,8 + vpxorq xmm3,xmm25,xmm3 + + + vpclmulqdq xmm4,xmm0,xmm3,0x00 + vpsrldq xmm4,xmm4,4 + vpclmulqdq xmm14,xmm0,xmm3,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm4,xmm24,0x96 + + + + + + + + + + + + + vpxorq xmm14,xmm14,xmm7 + + jmp NEAR $L$_after_reduction_670 +$L$_small_initial_compute_done_670: +$L$_after_reduction_670: + jmp NEAR $L$_last_blocks_done_668 +$L$_last_num_blocks_is_2_668: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + kmovq k1,[rax*8+r10] + cmp r15d,254 + jae NEAR $L$_16_blocks_overflow_671 + vpaddd ymm0,ymm2,ymm28 + jmp NEAR $L$_16_blocks_ok_671 + +$L$_16_blocks_overflow_671: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpshufb ymm0,ymm0,ymm29 +$L$_16_blocks_ok_671: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1024+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm0,1 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1088+rsp] + vpxorq ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[1152+rsp] + vaesenc ymm0,ymm0,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1216+rsp] + vaesenc ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc ymm0,ymm0,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 ymm17{k1}{z},[r11*1+r9] + vaesenc ymm0,ymm0,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm24,zmm14,zmm12,0x96 + vpternlogq zmm25,zmm7,zmm13,0x96 + vpternlogq zmm26,zmm10,zmm15,0x96 + vaesenc ymm0,ymm0,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc ymm0,ymm0,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc ymm0,ymm0,ymm31 + vaesenclast ymm0,ymm0,ymm30 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm11,zmm0,1 + mov r10,QWORD[120+rbp] + vmovdqu8 YMMWORD[r11*1+r10]{k1},ymm0 + vmovdqu8 zmm17{k1}{z},zmm17 + vpshufb ymm17,ymm17,ymm29 + vextracti32x4 xmm7,zmm17,1 + sub r13,16 * (2 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_672 + + + + + + sub 
r13,16 + mov QWORD[r8],0 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm17,ymm1,0x01 + vpclmulqdq ymm5,ymm17,ymm1,0x10 + vpclmulqdq ymm0,ymm17,ymm1,0x11 + vpclmulqdq ymm3,ymm17,ymm1,0x00 + vpxorq zmm4,zmm4,zmm26 + vpxorq zmm0,zmm0,zmm24 + vpxorq zmm3,zmm3,zmm25 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_672 +$L$_small_initial_partial_block_672: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm17,xmm1,0x01 + vpclmulqdq xmm5,xmm17,xmm1,0x10 + vpclmulqdq xmm0,xmm17,xmm1,0x11 + vpclmulqdq xmm3,xmm17,xmm1,0x00 + vpxorq zmm4,zmm4,zmm26 + vpxorq zmm0,zmm0,zmm24 + vpxorq zmm3,zmm3,zmm25 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_672: + + or r13,r13 + je NEAR $L$_after_reduction_672 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_672: + jmp NEAR $L$_last_blocks_done_668 +$L$_last_num_blocks_is_3_668: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + kmovq k1,[rax*8+r10] + cmp r15d,253 + jae NEAR $L$_16_blocks_overflow_673 + vpaddd zmm0,zmm2,zmm28 + jmp NEAR $L$_16_blocks_ok_673 + +$L$_16_blocks_overflow_673: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpshufb zmm0,zmm0,zmm29 +$L$_16_blocks_ok_673: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1024+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm0,2 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1088+rsp] + vpxorq zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[1152+rsp] + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1216+rsp] + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + 
+ + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17{k1}{z},[r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm24,zmm14,zmm12,0x96 + vpternlogq zmm25,zmm7,zmm13,0x96 + vpternlogq zmm26,zmm10,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vpxorq zmm0,zmm0,zmm17 + vextracti32x4 xmm11,zmm0,2 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10]{k1},zmm0 + vmovdqu8 zmm17{k1}{z},zmm17 + vpshufb zmm17,zmm17,zmm29 + vextracti32x4 xmm7,zmm17,2 + sub r13,16 * (3 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_674 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpxorq zmm4,zmm4,zmm26 + vpxorq zmm0,zmm0,zmm24 + vpxorq zmm3,zmm3,zmm25 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_674 +$L$_small_initial_partial_block_674: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm17,ymm1,0x01 + vpclmulqdq ymm5,ymm17,ymm1,0x10 + vpclmulqdq ymm0,ymm17,ymm1,0x11 + vpclmulqdq ymm3,ymm17,ymm1,0x00 + vpxorq zmm4,zmm4,zmm26 + vpxorq zmm0,zmm0,zmm24 + vpxorq zmm3,zmm3,zmm25 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_674: + + or r13,r13 + je NEAR $L$_after_reduction_674 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_674: + jmp NEAR $L$_last_blocks_done_668 +$L$_last_num_blocks_is_4_668: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + kmovq k1,[rax*8+r10] + cmp r15d,252 + jae NEAR 
$L$_16_blocks_overflow_675 + vpaddd zmm0,zmm2,zmm28 + jmp NEAR $L$_16_blocks_ok_675 + +$L$_16_blocks_overflow_675: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpshufb zmm0,zmm0,zmm29 +$L$_16_blocks_ok_675: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1024+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm0,3 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1088+rsp] + vpxorq zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[1152+rsp] + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1216+rsp] + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17{k1}{z},[r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm24,zmm14,zmm12,0x96 + vpternlogq zmm25,zmm7,zmm13,0x96 + vpternlogq zmm26,zmm10,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vpxorq zmm0,zmm0,zmm17 + vextracti32x4 xmm11,zmm0,3 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10]{k1},zmm0 + vmovdqu8 zmm17{k1}{z},zmm17 + vpshufb zmm17,zmm17,zmm29 + vextracti32x4 xmm7,zmm17,3 + sub r13,16 * (4 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_676 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + + vpxorq zmm30,zmm30,zmm26 + vpxorq zmm8,zmm8,zmm24 + vpxorq zmm22,zmm22,zmm25 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq 
xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_676 +$L$_small_initial_partial_block_676: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpxorq zmm4,zmm4,zmm26 + vpxorq zmm0,zmm0,zmm24 + vpxorq zmm3,zmm3,zmm25 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_676: + + or r13,r13 + je NEAR $L$_after_reduction_676 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_676: + jmp NEAR $L$_last_blocks_done_668 +$L$_last_num_blocks_is_5_668: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,64 + kmovq k1,[rax*8+r10] + cmp r15d,251 + jae NEAR $L$_16_blocks_overflow_677 + vpaddd zmm0,zmm2,zmm28 + vpaddd xmm3,xmm0,xmm27 + jmp NEAR $L$_16_blocks_ok_677 + +$L$_16_blocks_overflow_677: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb xmm3,xmm3,xmm29 +$L$_16_blocks_ok_677: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1024+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm3,0 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1088+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[1152+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1216+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 xmm19{k1}{z},[64+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + 
vaesenc xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm24,zmm14,zmm12,0x96 + vpternlogq zmm25,zmm7,zmm13,0x96 + vpternlogq zmm26,zmm10,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast xmm3,xmm3,xmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq xmm3,xmm3,xmm19 + vextracti32x4 xmm11,zmm3,0 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 XMMWORD[64+r11*1+r10]{k1},xmm3 + vmovdqu8 zmm19{k1}{z},zmm19 + vpshufb zmm17,zmm17,zmm29 + vpshufb xmm19,xmm19,xmm29 + vextracti32x4 xmm7,zmm19,0 + sub r13,16 * (5 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_678 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm19,xmm1,0x01 + vpclmulqdq xmm5,xmm19,xmm1,0x10 + vpclmulqdq xmm0,xmm19,xmm1,0x11 + vpclmulqdq xmm3,xmm19,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_678 +$L$_small_initial_partial_block_678: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + + vpxorq zmm30,zmm30,zmm26 + vpxorq zmm8,zmm8,zmm24 + vpxorq zmm22,zmm22,zmm25 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_678: + + or r13,r13 + je NEAR $L$_after_reduction_678 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_678: + jmp NEAR $L$_last_blocks_done_668 +$L$_last_num_blocks_is_6_668: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,64 + kmovq k1,[rax*8+r10] + cmp r15d,250 + jae NEAR 
$L$_16_blocks_overflow_679 + vpaddd zmm0,zmm2,zmm28 + vpaddd ymm3,ymm0,ymm27 + jmp NEAR $L$_16_blocks_ok_679 + +$L$_16_blocks_overflow_679: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb ymm3,ymm3,ymm29 +$L$_16_blocks_ok_679: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1024+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm3,1 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1088+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[1152+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1216+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 ymm19{k1}{z},[64+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm24,zmm14,zmm12,0x96 + vpternlogq zmm25,zmm7,zmm13,0x96 + vpternlogq zmm26,zmm10,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast ymm3,ymm3,ymm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm11,zmm3,1 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 YMMWORD[64+r11*1+r10]{k1},ymm3 + vmovdqu8 zmm19{k1}{z},zmm19 + vpshufb zmm17,zmm17,zmm29 + vpshufb ymm19,ymm19,ymm29 + vextracti32x4 xmm7,zmm19,1 + sub r13,16 * (6 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_680 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm19,ymm1,0x01 + vpclmulqdq ymm5,ymm19,ymm1,0x10 
+ vpclmulqdq ymm0,ymm19,ymm1,0x11 + vpclmulqdq ymm3,ymm19,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_680 +$L$_small_initial_partial_block_680: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm19,xmm1,0x01 + vpclmulqdq xmm5,xmm19,xmm1,0x10 + vpclmulqdq xmm0,xmm19,xmm1,0x11 + vpclmulqdq xmm3,xmm19,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_680: + + or r13,r13 + je NEAR $L$_after_reduction_680 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_680: + jmp NEAR $L$_last_blocks_done_668 +$L$_last_num_blocks_is_7_668: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,64 + kmovq k1,[rax*8+r10] + cmp r15d,249 + jae NEAR $L$_16_blocks_overflow_681 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + jmp NEAR $L$_16_blocks_ok_681 + +$L$_16_blocks_overflow_681: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 +$L$_16_blocks_ok_681: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1024+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm3,2 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1088+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[1152+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 
zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1216+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19{k1}{z},[64+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm24,zmm14,zmm12,0x96 + vpternlogq zmm25,zmm7,zmm13,0x96 + vpternlogq zmm26,zmm10,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti32x4 xmm11,zmm3,2 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10]{k1},zmm3 + vmovdqu8 zmm19{k1}{z},zmm19 + vpshufb zmm17,zmm17,zmm29 + vpshufb zmm19,zmm19,zmm29 + vextracti32x4 xmm7,zmm19,2 + sub r13,16 * (7 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_682 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm19,zmm1,0x01 + vpclmulqdq zmm5,zmm19,zmm1,0x10 + vpclmulqdq zmm0,zmm19,zmm1,0x11 + vpclmulqdq zmm3,zmm19,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_682 +$L$_small_initial_partial_block_682: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 
ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm19,ymm1,0x01 + vpclmulqdq ymm5,ymm19,ymm1,0x10 + vpclmulqdq ymm0,ymm19,ymm1,0x11 + vpclmulqdq ymm3,ymm19,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_682: + + or r13,r13 + je NEAR $L$_after_reduction_682 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_682: + jmp NEAR $L$_last_blocks_done_668 +$L$_last_num_blocks_is_8_668: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,64 + kmovq k1,[rax*8+r10] + cmp r15d,248 + jae NEAR $L$_16_blocks_overflow_683 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + jmp NEAR $L$_16_blocks_ok_683 + +$L$_16_blocks_overflow_683: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 +$L$_16_blocks_ok_683: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1024+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm3,3 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1088+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[1152+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1216+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19{k1}{z},[64+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm24,zmm14,zmm12,0x96 + vpternlogq zmm25,zmm7,zmm13,0x96 + 
vpternlogq zmm26,zmm10,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti32x4 xmm11,zmm3,3 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10]{k1},zmm3 + vmovdqu8 zmm19{k1}{z},zmm19 + vpshufb zmm17,zmm17,zmm29 + vpshufb zmm19,zmm19,zmm29 + vextracti32x4 xmm7,zmm19,3 + sub r13,16 * (8 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_684 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[224+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + + vpxorq zmm30,zmm30,zmm26 + vpxorq zmm8,zmm8,zmm24 + vpxorq zmm22,zmm22,zmm25 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_684 +$L$_small_initial_partial_block_684: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm19,zmm1,0x01 + vpclmulqdq zmm5,zmm19,zmm1,0x10 + vpclmulqdq zmm0,zmm19,zmm1,0x11 + vpclmulqdq zmm3,zmm19,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_684: + + or r13,r13 + je NEAR $L$_after_reduction_684 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_684: + jmp NEAR $L$_last_blocks_done_668 
+$L$_last_num_blocks_is_9_668: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,128 + kmovq k1,[rax*8+r10] + cmp r15d,247 + jae NEAR $L$_16_blocks_overflow_685 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd xmm4,xmm3,xmm27 + jmp NEAR $L$_16_blocks_ok_685 + +$L$_16_blocks_overflow_685: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb xmm4,xmm4,xmm29 +$L$_16_blocks_ok_685: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1024+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm4,0 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1088+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[1152+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1216+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 xmm20{k1}{z},[128+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm24,zmm14,zmm12,0x96 + vpternlogq zmm25,zmm7,zmm13,0x96 + vpternlogq zmm26,zmm10,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast xmm4,xmm4,xmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq xmm4,xmm4,xmm20 + vextracti32x4 
xmm11,zmm4,0 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 XMMWORD[128+r11*1+r10]{k1},xmm4 + vmovdqu8 zmm20{k1}{z},zmm20 + vpshufb zmm17,zmm17,zmm29 + vpshufb zmm19,zmm19,zmm29 + vpshufb xmm20,xmm20,xmm29 + vextracti32x4 xmm7,zmm20,0 + sub r13,16 * (9 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_686 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[208+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm20,xmm1,0x01 + vpclmulqdq xmm5,xmm20,xmm1,0x10 + vpclmulqdq xmm0,xmm20,xmm1,0x11 + vpclmulqdq xmm3,xmm20,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_686 +$L$_small_initial_partial_block_686: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[224+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + + vpxorq zmm30,zmm30,zmm26 + vpxorq zmm8,zmm8,zmm24 + vpxorq zmm22,zmm22,zmm25 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_686: + + or r13,r13 + je NEAR $L$_after_reduction_686 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_686: + jmp NEAR $L$_last_blocks_done_668 +$L$_last_num_blocks_is_10_668: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,128 + kmovq k1,[rax*8+r10] + cmp r15d,246 + jae NEAR $L$_16_blocks_overflow_687 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd ymm4,ymm3,ymm27 + jmp NEAR $L$_16_blocks_ok_687 + +$L$_16_blocks_overflow_687: + 
vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb ymm4,ymm4,ymm29 +$L$_16_blocks_ok_687: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1024+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm4,1 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1088+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[1152+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1216+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 ymm20{k1}{z},[128+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm24,zmm14,zmm12,0x96 + vpternlogq zmm25,zmm7,zmm13,0x96 + vpternlogq zmm26,zmm10,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast ymm4,ymm4,ymm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq ymm4,ymm4,ymm20 + vextracti32x4 xmm11,zmm4,1 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 YMMWORD[128+r11*1+r10]{k1},ymm4 + vmovdqu8 zmm20{k1}{z},zmm20 + vpshufb zmm17,zmm17,zmm29 + vpshufb zmm19,zmm19,zmm29 + vpshufb ymm20,ymm20,ymm29 + vextracti32x4 xmm7,zmm20,1 + sub r13,16 * (10 - 
1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_688 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[192+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm20,ymm1,0x01 + vpclmulqdq ymm5,ymm20,ymm1,0x10 + vpclmulqdq ymm0,ymm20,ymm1,0x11 + vpclmulqdq ymm3,ymm20,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_688 +$L$_small_initial_partial_block_688: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[208+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm20,xmm1,0x01 + vpclmulqdq xmm5,xmm20,xmm1,0x10 + vpclmulqdq xmm0,xmm20,xmm1,0x11 + vpclmulqdq xmm3,xmm20,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_688: + + or r13,r13 + je NEAR $L$_after_reduction_688 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_688: + jmp NEAR $L$_last_blocks_done_668 +$L$_last_num_blocks_is_11_668: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,128 + kmovq k1,[rax*8+r10] + cmp r15d,245 + jae NEAR $L$_16_blocks_overflow_689 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + jmp NEAR $L$_16_blocks_ok_689 + +$L$_16_blocks_overflow_689: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 
zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 +$L$_16_blocks_ok_689: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1024+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm4,2 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1088+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[1152+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1216+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20{k1}{z},[128+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm24,zmm14,zmm12,0x96 + vpternlogq zmm25,zmm7,zmm13,0x96 + vpternlogq zmm26,zmm10,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vextracti32x4 xmm11,zmm4,2 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10]{k1},zmm4 + vmovdqu8 zmm20{k1}{z},zmm20 + vpshufb zmm17,zmm17,zmm29 + vpshufb zmm19,zmm19,zmm29 + vpshufb zmm20,zmm20,zmm29 + vextracti32x4 xmm7,zmm20,2 + sub r13,16 * (11 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_690 + + + + + + 
sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[176+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm20,zmm1,0x01 + vpclmulqdq zmm5,zmm20,zmm1,0x10 + vpclmulqdq zmm0,zmm20,zmm1,0x11 + vpclmulqdq zmm3,zmm20,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_690 +$L$_small_initial_partial_block_690: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[192+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm20,ymm1,0x01 + vpclmulqdq ymm5,ymm20,ymm1,0x10 + vpclmulqdq ymm0,ymm20,ymm1,0x11 + vpclmulqdq ymm3,ymm20,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_690: + + or r13,r13 + je NEAR $L$_after_reduction_690 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_690: + jmp NEAR $L$_last_blocks_done_668 +$L$_last_num_blocks_is_12_668: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,128 + kmovq k1,[rax*8+r10] + cmp r15d,244 + jae NEAR $L$_16_blocks_overflow_691 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + jmp NEAR $L$_16_blocks_ok_691 + +$L$_16_blocks_overflow_691: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd 
zmm4,zmm3,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 +$L$_16_blocks_ok_691: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1024+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm4,3 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1088+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[1152+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1216+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20{k1}{z},[128+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm24,zmm14,zmm12,0x96 + vpternlogq zmm25,zmm7,zmm13,0x96 + vpternlogq zmm26,zmm10,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vextracti32x4 xmm11,zmm4,3 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10]{k1},zmm4 + vmovdqu8 zmm20{k1}{z},zmm20 + vpshufb zmm17,zmm17,zmm29 + vpshufb zmm19,zmm19,zmm29 + vpshufb zmm20,zmm20,zmm29 + vextracti32x4 xmm7,zmm20,3 + sub r13,16 * (12 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_692 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 
zmm1,ZMMWORD[160+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[224+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + + vpxorq zmm30,zmm30,zmm26 + vpxorq zmm8,zmm8,zmm24 + vpxorq zmm22,zmm22,zmm25 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_692 +$L$_small_initial_partial_block_692: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[176+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm20,zmm1,0x01 + vpclmulqdq zmm5,zmm20,zmm1,0x10 + vpclmulqdq zmm0,zmm20,zmm1,0x11 + vpclmulqdq zmm3,zmm20,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_692: + + or r13,r13 + je NEAR $L$_after_reduction_692 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_692: + jmp NEAR $L$_last_blocks_done_668 +$L$_last_num_blocks_is_13_668: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,192 + kmovq k1,[rax*8+r10] + cmp r15d,243 + jae NEAR $L$_16_blocks_overflow_693 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + vpaddd xmm5,xmm4,xmm27 + jmp NEAR $L$_16_blocks_ok_693 + +$L$_16_blocks_overflow_693: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + 
vpaddd zmm5,zmm4,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb xmm5,xmm5,xmm29 +$L$_16_blocks_ok_693: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1024+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm5,0 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1088+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vpxorq xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[1152+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1216+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20,ZMMWORD[128+r11*1+r9] + vmovdqu8 xmm21{k1}{z},[192+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm24,zmm14,zmm12,0x96 + vpternlogq zmm25,zmm7,zmm13,0x96 + vpternlogq zmm26,zmm10,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vaesenclast xmm5,xmm5,xmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vpxorq xmm5,xmm5,xmm21 + vextracti32x4 xmm11,zmm5,0 + 
mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm4 + vmovdqu8 XMMWORD[192+r11*1+r10]{k1},xmm5 + vmovdqu8 zmm21{k1}{z},zmm21 + vpshufb zmm17,zmm17,zmm29 + vpshufb zmm19,zmm19,zmm29 + vpshufb zmm20,zmm20,zmm29 + vpshufb xmm21,xmm21,xmm29 + vextracti32x4 xmm7,zmm21,0 + sub r13,16 * (13 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_694 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[144+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[208+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm21,xmm1,0x01 + vpclmulqdq xmm5,xmm21,xmm1,0x10 + vpclmulqdq xmm0,xmm21,xmm1,0x11 + vpclmulqdq xmm3,xmm21,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_694 +$L$_small_initial_partial_block_694: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[160+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[224+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + + vpxorq zmm30,zmm30,zmm26 + vpxorq zmm8,zmm8,zmm24 + vpxorq zmm22,zmm22,zmm25 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + 
+$L$_small_initial_compute_done_694: + + or r13,r13 + je NEAR $L$_after_reduction_694 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_694: + jmp NEAR $L$_last_blocks_done_668 +$L$_last_num_blocks_is_14_668: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,192 + kmovq k1,[rax*8+r10] + cmp r15d,242 + jae NEAR $L$_16_blocks_overflow_695 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + vpaddd ymm5,ymm4,ymm27 + jmp NEAR $L$_16_blocks_ok_695 + +$L$_16_blocks_overflow_695: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpaddd zmm5,zmm4,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb ymm5,ymm5,ymm29 +$L$_16_blocks_ok_695: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1024+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm5,1 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1088+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vpxorq ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[1152+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1216+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20,ZMMWORD[128+r11*1+r9] + vmovdqu8 ymm21{k1}{z},[192+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm24,zmm14,zmm12,0x96 + vpternlogq zmm25,zmm7,zmm13,0x96 + vpternlogq zmm26,zmm10,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc ymm5,ymm5,ymm30 + vbroadcastf64x2 
zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vaesenclast ymm5,ymm5,ymm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vpxorq ymm5,ymm5,ymm21 + vextracti32x4 xmm11,zmm5,1 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm4 + vmovdqu8 YMMWORD[192+r11*1+r10]{k1},ymm5 + vmovdqu8 zmm21{k1}{z},zmm21 + vpshufb zmm17,zmm17,zmm29 + vpshufb zmm19,zmm19,zmm29 + vpshufb zmm20,zmm20,zmm29 + vpshufb ymm21,ymm21,ymm29 + vextracti32x4 xmm7,zmm21,1 + sub r13,16 * (14 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_696 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[128+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[192+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm21,ymm1,0x01 + vpclmulqdq ymm5,ymm21,ymm1,0x10 + vpclmulqdq ymm0,ymm21,ymm1,0x11 + vpclmulqdq ymm3,ymm21,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_696 +$L$_small_initial_partial_block_696: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[144+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[208+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 xmm1,XMMWORD[336+rdx] + 
vpclmulqdq xmm4,xmm21,xmm1,0x01 + vpclmulqdq xmm5,xmm21,xmm1,0x10 + vpclmulqdq xmm0,xmm21,xmm1,0x11 + vpclmulqdq xmm3,xmm21,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_696: + + or r13,r13 + je NEAR $L$_after_reduction_696 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_696: + jmp NEAR $L$_last_blocks_done_668 +$L$_last_num_blocks_is_15_668: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,192 + kmovq k1,[rax*8+r10] + cmp r15d,241 + jae NEAR $L$_16_blocks_overflow_697 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + vpaddd zmm5,zmm4,zmm27 + jmp NEAR $L$_16_blocks_ok_697 + +$L$_16_blocks_overflow_697: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpaddd zmm5,zmm4,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb zmm5,zmm5,zmm29 +$L$_16_blocks_ok_697: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1024+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm5,2 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1088+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[1152+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1216+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20,ZMMWORD[128+r11*1+r9] + vmovdqu8 
zmm21{k1}{z},[192+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm24,zmm14,zmm12,0x96 + vpternlogq zmm25,zmm7,zmm13,0x96 + vpternlogq zmm26,zmm10,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vaesenclast zmm5,zmm5,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vpxorq zmm5,zmm5,zmm21 + vextracti32x4 xmm11,zmm5,2 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm4 + vmovdqu8 ZMMWORD[192+r11*1+r10]{k1},zmm5 + vmovdqu8 zmm21{k1}{z},zmm21 + vpshufb zmm17,zmm17,zmm29 + vpshufb zmm19,zmm19,zmm29 + vpshufb zmm20,zmm20,zmm29 + vpshufb zmm21,zmm21,zmm29 + vextracti32x4 xmm7,zmm21,2 + sub r13,16 * (15 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_698 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[112+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[176+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm21,zmm1,0x01 + vpclmulqdq zmm5,zmm21,zmm1,0x10 + vpclmulqdq zmm0,zmm21,zmm1,0x11 + vpclmulqdq zmm3,zmm21,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq 
xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_698 +$L$_small_initial_partial_block_698: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[128+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[192+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm21,ymm1,0x01 + vpclmulqdq ymm5,ymm21,ymm1,0x10 + vpclmulqdq ymm0,ymm21,ymm1,0x11 + vpclmulqdq ymm3,ymm21,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_698: + + or r13,r13 + je NEAR $L$_after_reduction_698 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_698: + jmp NEAR $L$_last_blocks_done_668 +$L$_last_num_blocks_is_16_668: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,192 + kmovq k1,[rax*8+r10] + cmp r15d,240 + jae NEAR $L$_16_blocks_overflow_699 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + vpaddd zmm5,zmm4,zmm27 + jmp NEAR $L$_16_blocks_ok_699 + +$L$_16_blocks_overflow_699: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpaddd zmm5,zmm4,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb zmm5,zmm5,zmm29 +$L$_16_blocks_ok_699: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1024+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm5,3 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1088+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[1152+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + 
vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1216+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20,ZMMWORD[128+r11*1+r9] + vmovdqu8 zmm21{k1}{z},[192+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm24,zmm14,zmm12,0x96 + vpternlogq zmm25,zmm7,zmm13,0x96 + vpternlogq zmm26,zmm10,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vaesenclast zmm5,zmm5,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vpxorq zmm5,zmm5,zmm21 + vextracti32x4 xmm11,zmm5,3 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm4 + vmovdqu8 ZMMWORD[192+r11*1+r10]{k1},zmm5 + vmovdqu8 zmm21{k1}{z},zmm21 + vpshufb zmm17,zmm17,zmm29 + vpshufb zmm19,zmm19,zmm29 + vpshufb zmm20,zmm20,zmm29 + vpshufb zmm21,zmm21,zmm29 + vextracti32x4 xmm7,zmm21,3 + sub r13,16 * (16 - 1) +$L$_small_initial_partial_block_700: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[112+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[176+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq 
zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm21,zmm1,0x01 + vpclmulqdq zmm5,zmm21,zmm1,0x10 + vpclmulqdq zmm0,zmm21,zmm1,0x11 + vpclmulqdq zmm3,zmm21,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_700: + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_700: + jmp NEAR $L$_last_blocks_done_668 +$L$_last_num_blocks_is_0_668: + vmovdqa64 zmm13,ZMMWORD[1024+rsp] + vmovdqu64 zmm12,ZMMWORD[rbx*1+rsp] + vpclmulqdq zmm0,zmm13,zmm12,0x11 + vpclmulqdq zmm3,zmm13,zmm12,0x00 + vpclmulqdq zmm4,zmm13,zmm12,0x01 + vpclmulqdq zmm5,zmm13,zmm12,0x10 + vmovdqa64 zmm13,ZMMWORD[1088+rsp] + vmovdqu64 zmm12,ZMMWORD[64+rbx*1+rsp] + vpclmulqdq zmm6,zmm13,zmm12,0x11 + vpclmulqdq zmm7,zmm13,zmm12,0x00 + vpclmulqdq zmm10,zmm13,zmm12,0x01 + vpclmulqdq zmm11,zmm13,zmm12,0x10 + vpternlogq zmm26,zmm4,zmm10,0x96 + vpternlogq zmm24,zmm0,zmm6,0x96 + vpternlogq zmm25,zmm3,zmm7,0x96 + vpternlogq zmm26,zmm5,zmm11,0x96 + vmovdqa64 zmm13,ZMMWORD[1152+rsp] + vmovdqu64 zmm12,ZMMWORD[128+rbx*1+rsp] + vpclmulqdq zmm0,zmm13,zmm12,0x11 + vpclmulqdq zmm3,zmm13,zmm12,0x00 + vpclmulqdq zmm4,zmm13,zmm12,0x01 + vpclmulqdq zmm5,zmm13,zmm12,0x10 + vmovdqa64 zmm13,ZMMWORD[1216+rsp] + vmovdqu64 zmm12,ZMMWORD[192+rbx*1+rsp] + vpclmulqdq zmm6,zmm13,zmm12,0x11 + vpclmulqdq zmm7,zmm13,zmm12,0x00 + vpclmulqdq zmm10,zmm13,zmm12,0x01 + vpclmulqdq zmm11,zmm13,zmm12,0x10 + + vpternlogq zmm26,zmm4,zmm10,0x96 + vpternlogq zmm24,zmm0,zmm6,0x96 + vpternlogq zmm25,zmm3,zmm7,0x96 + vpternlogq zmm26,zmm5,zmm11,0x96 + + vpsrldq zmm0,zmm26,8 + vpslldq zmm3,zmm26,8 + vpxorq zmm24,zmm24,zmm0 + vpxorq zmm25,zmm25,zmm3 + vextracti64x4 ymm0,zmm24,1 + vpxorq ymm24,ymm24,ymm0 + vextracti32x4 xmm0,ymm24,1 + vpxorq xmm24,xmm24,xmm0 + vextracti64x4 ymm3,zmm25,1 + vpxorq ymm25,ymm25,ymm3 + vextracti32x4 xmm3,ymm25,1 + vpxorq xmm25,xmm25,xmm3 + vmovdqa64 xmm4,XMMWORD[POLY2] + + + vpclmulqdq xmm0,xmm4,xmm25,0x01 + vpslldq xmm0,xmm0,8 + vpxorq xmm0,xmm25,xmm0 + + + vpclmulqdq xmm3,xmm4,xmm0,0x00 + vpsrldq xmm3,xmm3,4 + vpclmulqdq xmm14,xmm4,xmm0,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm3,xmm24,0x96 + +$L$_last_blocks_done_668: + vpshufb xmm2,xmm2,xmm29 + jmp NEAR $L$_ghash_done_659 +$L$_encrypt_32_blocks_659: + cmp r15b,240 + jae NEAR $L$_16_blocks_overflow_701 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + vpaddd zmm5,zmm4,zmm27 + jmp NEAR $L$_16_blocks_ok_701 +$L$_16_blocks_overflow_701: + vpshufb zmm2,zmm2,zmm29 + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpaddd zmm5,zmm4,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb zmm5,zmm5,zmm29 +$L$_16_blocks_ok_701: + vbroadcastf64x2 
zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rsp] + + + + + vshufi64x2 zmm2,zmm5,zmm5,255 + add r15b,16 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + + + + + + + + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + + vpclmulqdq zmm6,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + + + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + + + + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + + vpternlogq zmm6,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + + + + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + + + + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20,ZMMWORD[128+r11*1+r9] + vmovdqu8 zmm21,ZMMWORD[192+r11*1+r9] + + + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + + + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm26,zmm10,zmm15 + vpxorq zmm24,zmm6,zmm12 + vpxorq zmm25,zmm7,zmm13 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vaesenclast zmm5,zmm5,zmm30 + + + + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vpxorq zmm5,zmm5,zmm21 + + + + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm4 + vmovdqu8 ZMMWORD[192+r11*1+r10],zmm5 + vpshufb 
zmm0,zmm17,zmm29 + vpshufb zmm3,zmm19,zmm29 + vpshufb zmm4,zmm20,zmm29 + vpshufb zmm5,zmm21,zmm29 + vmovdqa64 ZMMWORD[1280+rsp],zmm0 + vmovdqa64 ZMMWORD[1344+rsp],zmm3 + vmovdqa64 ZMMWORD[1408+rsp],zmm4 + vmovdqa64 ZMMWORD[1472+rsp],zmm5 + cmp r15b,240 + jae NEAR $L$_16_blocks_overflow_702 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + vpaddd zmm5,zmm4,zmm27 + jmp NEAR $L$_16_blocks_ok_702 +$L$_16_blocks_overflow_702: + vpshufb zmm2,zmm2,zmm29 + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpaddd zmm5,zmm4,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb zmm5,zmm5,zmm29 +$L$_16_blocks_ok_702: + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1024+rsp] + vmovdqu64 zmm1,ZMMWORD[256+rsp] + + + + + vshufi64x2 zmm2,zmm5,zmm5,255 + add r15b,16 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[320+rsp] + vmovdqa64 zmm22,ZMMWORD[1088+rsp] + + + + + + + + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + + vpclmulqdq zmm6,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[384+rsp] + vmovdqa64 zmm8,ZMMWORD[1152+rsp] + + + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[448+rsp] + vmovdqa64 zmm22,ZMMWORD[1216+rsp] + + + + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + + vpternlogq zmm6,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + + + + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + + + + vmovdqu8 zmm17,ZMMWORD[256+r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[320+r11*1+r9] + vmovdqu8 zmm20,ZMMWORD[384+r11*1+r9] + vmovdqu8 zmm21,ZMMWORD[448+r11*1+r9] + + + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + + + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm26,zmm10,zmm15,0x96 + vpternlogq zmm24,zmm6,zmm12,0x96 + vpternlogq zmm25,zmm7,zmm13,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + 
vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vaesenclast zmm5,zmm5,zmm30 + + + + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vpxorq zmm5,zmm5,zmm21 + + + + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[256+r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[320+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[384+r11*1+r10],zmm4 + vmovdqu8 ZMMWORD[448+r11*1+r10],zmm5 + vpshufb zmm0,zmm17,zmm29 + vpshufb zmm3,zmm19,zmm29 + vpshufb zmm4,zmm20,zmm29 + vpshufb zmm5,zmm21,zmm29 + vmovdqa64 ZMMWORD[768+rsp],zmm0 + vmovdqa64 ZMMWORD[832+rsp],zmm3 + vmovdqa64 ZMMWORD[896+rsp],zmm4 + vmovdqa64 ZMMWORD[960+rsp],zmm5 + vmovdqa64 zmm13,ZMMWORD[1280+rsp] + vmovdqu64 zmm12,ZMMWORD[512+rsp] + vpclmulqdq zmm0,zmm13,zmm12,0x11 + vpclmulqdq zmm3,zmm13,zmm12,0x00 + vpclmulqdq zmm4,zmm13,zmm12,0x01 + vpclmulqdq zmm5,zmm13,zmm12,0x10 + vmovdqa64 zmm13,ZMMWORD[1344+rsp] + vmovdqu64 zmm12,ZMMWORD[576+rsp] + vpclmulqdq zmm6,zmm13,zmm12,0x11 + vpclmulqdq zmm7,zmm13,zmm12,0x00 + vpclmulqdq zmm10,zmm13,zmm12,0x01 + vpclmulqdq zmm11,zmm13,zmm12,0x10 + vpternlogq zmm26,zmm4,zmm10,0x96 + vpternlogq zmm24,zmm0,zmm6,0x96 + vpternlogq zmm25,zmm3,zmm7,0x96 + vpternlogq zmm26,zmm5,zmm11,0x96 + vmovdqa64 zmm13,ZMMWORD[1408+rsp] + vmovdqu64 zmm12,ZMMWORD[640+rsp] + vpclmulqdq zmm0,zmm13,zmm12,0x11 + vpclmulqdq zmm3,zmm13,zmm12,0x00 + vpclmulqdq zmm4,zmm13,zmm12,0x01 + vpclmulqdq zmm5,zmm13,zmm12,0x10 + vmovdqa64 zmm13,ZMMWORD[1472+rsp] + vmovdqu64 zmm12,ZMMWORD[704+rsp] + vpclmulqdq zmm6,zmm13,zmm12,0x11 + vpclmulqdq zmm7,zmm13,zmm12,0x00 + vpclmulqdq zmm10,zmm13,zmm12,0x01 + vpclmulqdq zmm11,zmm13,zmm12,0x10 + + vpternlogq zmm26,zmm4,zmm10,0x96 + vpternlogq zmm24,zmm0,zmm6,0x96 + vpternlogq zmm25,zmm3,zmm7,0x96 + vpternlogq zmm26,zmm5,zmm11,0x96 + + vpsrldq zmm0,zmm26,8 + vpslldq zmm3,zmm26,8 + vpxorq zmm24,zmm24,zmm0 + vpxorq zmm25,zmm25,zmm3 + vextracti64x4 ymm0,zmm24,1 + vpxorq ymm24,ymm24,ymm0 + vextracti32x4 xmm0,ymm24,1 + vpxorq xmm24,xmm24,xmm0 + vextracti64x4 ymm3,zmm25,1 + vpxorq ymm25,ymm25,ymm3 + vextracti32x4 xmm3,ymm25,1 + vpxorq xmm25,xmm25,xmm3 + vmovdqa64 xmm4,XMMWORD[POLY2] + + + vpclmulqdq xmm0,xmm4,xmm25,0x01 + vpslldq xmm0,xmm0,8 + vpxorq xmm0,xmm25,xmm0 + + + vpclmulqdq xmm3,xmm4,xmm0,0x00 + vpsrldq xmm3,xmm3,4 + vpclmulqdq xmm14,xmm4,xmm0,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm3,xmm24,0x96 + + sub r13,512 + add r11,512 + mov r10d,r13d + and r10d,~15 + mov ebx,512 + sub ebx,r10d + mov r10d,r13d + add r10d,15 + shr r10d,4 + je NEAR $L$_last_num_blocks_is_0_703 + + cmp r10d,8 + je NEAR $L$_last_num_blocks_is_8_703 + jb NEAR $L$_last_num_blocks_is_7_1_703 + + + cmp r10d,12 + je NEAR $L$_last_num_blocks_is_12_703 + jb NEAR $L$_last_num_blocks_is_11_9_703 + + + cmp r10d,15 + je NEAR $L$_last_num_blocks_is_15_703 + ja NEAR $L$_last_num_blocks_is_16_703 + cmp r10d,14 + je NEAR $L$_last_num_blocks_is_14_703 + jmp NEAR $L$_last_num_blocks_is_13_703 + +$L$_last_num_blocks_is_11_9_703: + + cmp r10d,10 + je NEAR $L$_last_num_blocks_is_10_703 + ja NEAR $L$_last_num_blocks_is_11_703 + jmp NEAR $L$_last_num_blocks_is_9_703 + +$L$_last_num_blocks_is_7_1_703: + cmp r10d,4 + je 
NEAR $L$_last_num_blocks_is_4_703 + jb NEAR $L$_last_num_blocks_is_3_1_703 + + cmp r10d,6 + ja NEAR $L$_last_num_blocks_is_7_703 + je NEAR $L$_last_num_blocks_is_6_703 + jmp NEAR $L$_last_num_blocks_is_5_703 + +$L$_last_num_blocks_is_3_1_703: + + cmp r10d,2 + ja NEAR $L$_last_num_blocks_is_3_703 + je NEAR $L$_last_num_blocks_is_2_703 +$L$_last_num_blocks_is_1_703: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + kmovq k1,[rax*8+r10] + cmp r15d,255 + jae NEAR $L$_16_blocks_overflow_704 + vpaddd xmm0,xmm2,xmm28 + jmp NEAR $L$_16_blocks_ok_704 + +$L$_16_blocks_overflow_704: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpshufb xmm0,xmm0,xmm29 +$L$_16_blocks_ok_704: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm0,0 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc xmm0,xmm0,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc xmm0,xmm0,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 xmm17{k1}{z},[r11*1+r9] + vaesenc xmm0,xmm0,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc xmm0,xmm0,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc xmm0,xmm0,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc xmm0,xmm0,xmm31 + vaesenclast xmm0,xmm0,xmm30 + vpxorq xmm0,xmm0,xmm17 + vextracti32x4 xmm11,zmm0,0 + mov r10,QWORD[120+rbp] + vmovdqu8 XMMWORD[r11*1+r10]{k1},xmm0 + vmovdqu8 zmm17{k1}{z},zmm17 + vpshufb xmm17,xmm17,xmm29 + vextracti32x4 xmm7,zmm17,0 + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_705 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm17,xmm1,0x01 + vpclmulqdq xmm5,xmm17,xmm1,0x10 + vpclmulqdq xmm0,xmm17,xmm1,0x11 + vpclmulqdq xmm3,xmm17,xmm1,0x00 + vpxorq zmm4,zmm4,zmm26 + vpxorq zmm0,zmm0,zmm24 + vpxorq zmm3,zmm3,zmm25 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 
xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_705 +$L$_small_initial_partial_block_705: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + + + vpsrldq zmm0,zmm26,8 + vpslldq zmm3,zmm26,8 + vpxorq zmm24,zmm24,zmm0 + vpxorq zmm25,zmm25,zmm3 + vextracti64x4 ymm0,zmm24,1 + vpxorq ymm24,ymm24,ymm0 + vextracti32x4 xmm0,ymm24,1 + vpxorq xmm24,xmm24,xmm0 + vextracti64x4 ymm3,zmm25,1 + vpxorq ymm25,ymm25,ymm3 + vextracti32x4 xmm3,ymm25,1 + vpxorq xmm25,xmm25,xmm3 + vmovdqa64 xmm0,XMMWORD[POLY2] + + + vpclmulqdq xmm3,xmm0,xmm25,0x01 + vpslldq xmm3,xmm3,8 + vpxorq xmm3,xmm25,xmm3 + + + vpclmulqdq xmm4,xmm0,xmm3,0x00 + vpsrldq xmm4,xmm4,4 + vpclmulqdq xmm14,xmm0,xmm3,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm4,xmm24,0x96 + + + + + + + + + + + + + vpxorq xmm14,xmm14,xmm7 + + jmp NEAR $L$_after_reduction_705 +$L$_small_initial_compute_done_705: +$L$_after_reduction_705: + jmp NEAR $L$_last_blocks_done_703 +$L$_last_num_blocks_is_2_703: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + kmovq k1,[rax*8+r10] + cmp r15d,254 + jae NEAR $L$_16_blocks_overflow_706 + vpaddd ymm0,ymm2,ymm28 + jmp NEAR $L$_16_blocks_ok_706 + +$L$_16_blocks_overflow_706: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpshufb ymm0,ymm0,ymm29 +$L$_16_blocks_ok_706: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm0,1 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc ymm0,ymm0,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc ymm0,ymm0,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 ymm17{k1}{z},[r11*1+r9] + vaesenc ymm0,ymm0,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc ymm0,ymm0,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] 
+ vaesenc ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc ymm0,ymm0,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc ymm0,ymm0,ymm31 + vaesenclast ymm0,ymm0,ymm30 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm11,zmm0,1 + mov r10,QWORD[120+rbp] + vmovdqu8 YMMWORD[r11*1+r10]{k1},ymm0 + vmovdqu8 zmm17{k1}{z},zmm17 + vpshufb ymm17,ymm17,ymm29 + vextracti32x4 xmm7,zmm17,1 + sub r13,16 * (2 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_707 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm17,ymm1,0x01 + vpclmulqdq ymm5,ymm17,ymm1,0x10 + vpclmulqdq ymm0,ymm17,ymm1,0x11 + vpclmulqdq ymm3,ymm17,ymm1,0x00 + vpxorq zmm4,zmm4,zmm26 + vpxorq zmm0,zmm0,zmm24 + vpxorq zmm3,zmm3,zmm25 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_707 +$L$_small_initial_partial_block_707: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm17,xmm1,0x01 + vpclmulqdq xmm5,xmm17,xmm1,0x10 + vpclmulqdq xmm0,xmm17,xmm1,0x11 + vpclmulqdq xmm3,xmm17,xmm1,0x00 + vpxorq zmm4,zmm4,zmm26 + vpxorq zmm0,zmm0,zmm24 + vpxorq zmm3,zmm3,zmm25 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_707: + + or r13,r13 + je NEAR $L$_after_reduction_707 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_707: + jmp NEAR $L$_last_blocks_done_703 +$L$_last_num_blocks_is_3_703: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + kmovq k1,[rax*8+r10] + cmp r15d,253 + jae NEAR $L$_16_blocks_overflow_708 + vpaddd zmm0,zmm2,zmm28 + jmp NEAR $L$_16_blocks_ok_708 + +$L$_16_blocks_overflow_708: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpshufb zmm0,zmm0,zmm29 +$L$_16_blocks_ok_708: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm0,2 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] 
+ vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17{k1}{z},[r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vpxorq zmm0,zmm0,zmm17 + vextracti32x4 xmm11,zmm0,2 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10]{k1},zmm0 + vmovdqu8 zmm17{k1}{z},zmm17 + vpshufb zmm17,zmm17,zmm29 + vextracti32x4 xmm7,zmm17,2 + sub r13,16 * (3 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_709 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpxorq zmm4,zmm4,zmm26 + vpxorq zmm0,zmm0,zmm24 + vpxorq zmm3,zmm3,zmm25 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_709 +$L$_small_initial_partial_block_709: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm17,ymm1,0x01 + vpclmulqdq ymm5,ymm17,ymm1,0x10 + vpclmulqdq ymm0,ymm17,ymm1,0x11 + vpclmulqdq ymm3,ymm17,ymm1,0x00 + vpxorq zmm4,zmm4,zmm26 + vpxorq zmm0,zmm0,zmm24 + vpxorq zmm3,zmm3,zmm25 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq 
xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_709: + + or r13,r13 + je NEAR $L$_after_reduction_709 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_709: + jmp NEAR $L$_last_blocks_done_703 +$L$_last_num_blocks_is_4_703: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + kmovq k1,[rax*8+r10] + cmp r15d,252 + jae NEAR $L$_16_blocks_overflow_710 + vpaddd zmm0,zmm2,zmm28 + jmp NEAR $L$_16_blocks_ok_710 + +$L$_16_blocks_overflow_710: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpshufb zmm0,zmm0,zmm29 +$L$_16_blocks_ok_710: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm0,3 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17{k1}{z},[r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vpxorq zmm0,zmm0,zmm17 + vextracti32x4 xmm11,zmm0,3 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10]{k1},zmm0 + vmovdqu8 zmm17{k1}{z},zmm17 + vpshufb zmm17,zmm17,zmm29 + vextracti32x4 xmm7,zmm17,3 + sub r13,16 * (4 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_711 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + + vpxorq zmm30,zmm30,zmm26 + vpxorq zmm8,zmm8,zmm24 + vpxorq zmm22,zmm22,zmm25 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq 
zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_711 +$L$_small_initial_partial_block_711: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpxorq zmm4,zmm4,zmm26 + vpxorq zmm0,zmm0,zmm24 + vpxorq zmm3,zmm3,zmm25 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_711: + + or r13,r13 + je NEAR $L$_after_reduction_711 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_711: + jmp NEAR $L$_last_blocks_done_703 +$L$_last_num_blocks_is_5_703: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,64 + kmovq k1,[rax*8+r10] + cmp r15d,251 + jae NEAR $L$_16_blocks_overflow_712 + vpaddd zmm0,zmm2,zmm28 + vpaddd xmm3,xmm0,xmm27 + jmp NEAR $L$_16_blocks_ok_712 + +$L$_16_blocks_overflow_712: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb xmm3,xmm3,xmm29 +$L$_16_blocks_ok_712: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm3,0 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq 
zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 xmm19{k1}{z},[64+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast xmm3,xmm3,xmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq xmm3,xmm3,xmm19 + vextracti32x4 xmm11,zmm3,0 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 XMMWORD[64+r11*1+r10]{k1},xmm3 + vmovdqu8 zmm19{k1}{z},zmm19 + vpshufb zmm17,zmm17,zmm29 + vpshufb xmm19,xmm19,xmm29 + vextracti32x4 xmm7,zmm19,0 + sub r13,16 * (5 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_713 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm19,xmm1,0x01 + vpclmulqdq xmm5,xmm19,xmm1,0x10 + vpclmulqdq xmm0,xmm19,xmm1,0x11 + vpclmulqdq xmm3,xmm19,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_713 +$L$_small_initial_partial_block_713: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + + vpxorq zmm30,zmm30,zmm26 + vpxorq zmm8,zmm8,zmm24 + vpxorq zmm22,zmm22,zmm25 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq 
xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_713: + + or r13,r13 + je NEAR $L$_after_reduction_713 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_713: + jmp NEAR $L$_last_blocks_done_703 +$L$_last_num_blocks_is_6_703: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,64 + kmovq k1,[rax*8+r10] + cmp r15d,250 + jae NEAR $L$_16_blocks_overflow_714 + vpaddd zmm0,zmm2,zmm28 + vpaddd ymm3,ymm0,ymm27 + jmp NEAR $L$_16_blocks_ok_714 + +$L$_16_blocks_overflow_714: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb ymm3,ymm3,ymm29 +$L$_16_blocks_ok_714: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm3,1 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 ymm19{k1}{z},[64+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast ymm3,ymm3,ymm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm11,zmm3,1 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 YMMWORD[64+r11*1+r10]{k1},ymm3 + vmovdqu8 zmm19{k1}{z},zmm19 + vpshufb zmm17,zmm17,zmm29 + vpshufb ymm19,ymm19,ymm29 + 
vextracti32x4 xmm7,zmm19,1 + sub r13,16 * (6 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_715 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm19,ymm1,0x01 + vpclmulqdq ymm5,ymm19,ymm1,0x10 + vpclmulqdq ymm0,ymm19,ymm1,0x11 + vpclmulqdq ymm3,ymm19,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_715 +$L$_small_initial_partial_block_715: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm19,xmm1,0x01 + vpclmulqdq xmm5,xmm19,xmm1,0x10 + vpclmulqdq xmm0,xmm19,xmm1,0x11 + vpclmulqdq xmm3,xmm19,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_715: + + or r13,r13 + je NEAR $L$_after_reduction_715 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_715: + jmp NEAR $L$_last_blocks_done_703 +$L$_last_num_blocks_is_7_703: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,64 + kmovq k1,[rax*8+r10] + cmp r15d,249 + jae NEAR $L$_16_blocks_overflow_716 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + jmp NEAR $L$_16_blocks_ok_716 + +$L$_16_blocks_overflow_716: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 +$L$_16_blocks_ok_716: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm3,2 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 
+ vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19{k1}{z},[64+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti32x4 xmm11,zmm3,2 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10]{k1},zmm3 + vmovdqu8 zmm19{k1}{z},zmm19 + vpshufb zmm17,zmm17,zmm29 + vpshufb zmm19,zmm19,zmm29 + vextracti32x4 xmm7,zmm19,2 + sub r13,16 * (7 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_717 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm19,zmm1,0x01 + vpclmulqdq zmm5,zmm19,zmm1,0x10 + vpclmulqdq zmm0,zmm19,zmm1,0x11 + vpclmulqdq zmm3,zmm19,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + 
vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_717 +$L$_small_initial_partial_block_717: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm19,ymm1,0x01 + vpclmulqdq ymm5,ymm19,ymm1,0x10 + vpclmulqdq ymm0,ymm19,ymm1,0x11 + vpclmulqdq ymm3,ymm19,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_717: + + or r13,r13 + je NEAR $L$_after_reduction_717 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_717: + jmp NEAR $L$_last_blocks_done_703 +$L$_last_num_blocks_is_8_703: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,64 + kmovq k1,[rax*8+r10] + cmp r15d,248 + jae NEAR $L$_16_blocks_overflow_718 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + jmp NEAR $L$_16_blocks_ok_718 + +$L$_16_blocks_overflow_718: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 +$L$_16_blocks_ok_718: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm3,3 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19{k1}{z},[64+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc 
zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti32x4 xmm11,zmm3,3 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10]{k1},zmm3 + vmovdqu8 zmm19{k1}{z},zmm19 + vpshufb zmm17,zmm17,zmm29 + vpshufb zmm19,zmm19,zmm29 + vextracti32x4 xmm7,zmm19,3 + sub r13,16 * (8 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_719 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[224+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + + vpxorq zmm30,zmm30,zmm26 + vpxorq zmm8,zmm8,zmm24 + vpxorq zmm22,zmm22,zmm25 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_719 +$L$_small_initial_partial_block_719: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm19,zmm1,0x01 + vpclmulqdq zmm5,zmm19,zmm1,0x10 + vpclmulqdq zmm0,zmm19,zmm1,0x11 + vpclmulqdq zmm3,zmm19,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq 
xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_719: + + or r13,r13 + je NEAR $L$_after_reduction_719 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_719: + jmp NEAR $L$_last_blocks_done_703 +$L$_last_num_blocks_is_9_703: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,128 + kmovq k1,[rax*8+r10] + cmp r15d,247 + jae NEAR $L$_16_blocks_overflow_720 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd xmm4,xmm3,xmm27 + jmp NEAR $L$_16_blocks_ok_720 + +$L$_16_blocks_overflow_720: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb xmm4,xmm4,xmm29 +$L$_16_blocks_ok_720: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm4,0 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 xmm20{k1}{z},[128+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc 
zmm3,zmm3,zmm30 + vaesenc xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast xmm4,xmm4,xmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq xmm4,xmm4,xmm20 + vextracti32x4 xmm11,zmm4,0 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 XMMWORD[128+r11*1+r10]{k1},xmm4 + vmovdqu8 zmm20{k1}{z},zmm20 + vpshufb zmm17,zmm17,zmm29 + vpshufb zmm19,zmm19,zmm29 + vpshufb xmm20,xmm20,xmm29 + vextracti32x4 xmm7,zmm20,0 + sub r13,16 * (9 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_721 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[208+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm20,xmm1,0x01 + vpclmulqdq xmm5,xmm20,xmm1,0x10 + vpclmulqdq xmm0,xmm20,xmm1,0x11 + vpclmulqdq xmm3,xmm20,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_721 +$L$_small_initial_partial_block_721: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[224+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + + vpxorq zmm30,zmm30,zmm26 + vpxorq zmm8,zmm8,zmm24 + vpxorq zmm22,zmm22,zmm25 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_721: + + or r13,r13 + je NEAR $L$_after_reduction_721 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_721: + jmp NEAR 
$L$_last_blocks_done_703 +$L$_last_num_blocks_is_10_703: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,128 + kmovq k1,[rax*8+r10] + cmp r15d,246 + jae NEAR $L$_16_blocks_overflow_722 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd ymm4,ymm3,ymm27 + jmp NEAR $L$_16_blocks_ok_722 + +$L$_16_blocks_overflow_722: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb ymm4,ymm4,ymm29 +$L$_16_blocks_ok_722: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm4,1 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 ymm20{k1}{z},[128+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast ymm4,ymm4,ymm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq ymm4,ymm4,ymm20 + vextracti32x4 
xmm11,zmm4,1 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 YMMWORD[128+r11*1+r10]{k1},ymm4 + vmovdqu8 zmm20{k1}{z},zmm20 + vpshufb zmm17,zmm17,zmm29 + vpshufb zmm19,zmm19,zmm29 + vpshufb ymm20,ymm20,ymm29 + vextracti32x4 xmm7,zmm20,1 + sub r13,16 * (10 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_723 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[192+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm20,ymm1,0x01 + vpclmulqdq ymm5,ymm20,ymm1,0x10 + vpclmulqdq ymm0,ymm20,ymm1,0x11 + vpclmulqdq ymm3,ymm20,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_723 +$L$_small_initial_partial_block_723: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[208+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm20,xmm1,0x01 + vpclmulqdq xmm5,xmm20,xmm1,0x10 + vpclmulqdq xmm0,xmm20,xmm1,0x11 + vpclmulqdq xmm3,xmm20,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_723: + + or r13,r13 + je NEAR $L$_after_reduction_723 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_723: + jmp NEAR $L$_last_blocks_done_703 +$L$_last_num_blocks_is_11_703: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,128 
+ kmovq k1,[rax*8+r10] + cmp r15d,245 + jae NEAR $L$_16_blocks_overflow_724 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + jmp NEAR $L$_16_blocks_ok_724 + +$L$_16_blocks_overflow_724: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 +$L$_16_blocks_ok_724: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm4,2 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20{k1}{z},[128+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vextracti32x4 xmm11,zmm4,2 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 
ZMMWORD[128+r11*1+r10]{k1},zmm4 + vmovdqu8 zmm20{k1}{z},zmm20 + vpshufb zmm17,zmm17,zmm29 + vpshufb zmm19,zmm19,zmm29 + vpshufb zmm20,zmm20,zmm29 + vextracti32x4 xmm7,zmm20,2 + sub r13,16 * (11 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_725 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[176+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm20,zmm1,0x01 + vpclmulqdq zmm5,zmm20,zmm1,0x10 + vpclmulqdq zmm0,zmm20,zmm1,0x11 + vpclmulqdq zmm3,zmm20,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_725 +$L$_small_initial_partial_block_725: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[192+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm20,ymm1,0x01 + vpclmulqdq ymm5,ymm20,ymm1,0x10 + vpclmulqdq ymm0,ymm20,ymm1,0x11 + vpclmulqdq ymm3,ymm20,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_725: + + or r13,r13 + je NEAR $L$_after_reduction_725 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_725: + jmp NEAR $L$_last_blocks_done_703 +$L$_last_num_blocks_is_12_703: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,128 + kmovq k1,[rax*8+r10] + cmp r15d,244 + jae NEAR $L$_16_blocks_overflow_726 + 
vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + jmp NEAR $L$_16_blocks_ok_726 + +$L$_16_blocks_overflow_726: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 +$L$_16_blocks_ok_726: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm4,3 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20{k1}{z},[128+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vextracti32x4 xmm11,zmm4,3 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10]{k1},zmm4 + vmovdqu8 zmm20{k1}{z},zmm20 + vpshufb 
zmm17,zmm17,zmm29 + vpshufb zmm19,zmm19,zmm29 + vpshufb zmm20,zmm20,zmm29 + vextracti32x4 xmm7,zmm20,3 + sub r13,16 * (12 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_727 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[160+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[224+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + + vpxorq zmm30,zmm30,zmm26 + vpxorq zmm8,zmm8,zmm24 + vpxorq zmm22,zmm22,zmm25 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_727 +$L$_small_initial_partial_block_727: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[176+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm20,zmm1,0x01 + vpclmulqdq zmm5,zmm20,zmm1,0x10 + vpclmulqdq zmm0,zmm20,zmm1,0x11 + vpclmulqdq zmm3,zmm20,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_727: + + or r13,r13 + je NEAR $L$_after_reduction_727 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_727: + jmp NEAR $L$_last_blocks_done_703 +$L$_last_num_blocks_is_13_703: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,192 + kmovq k1,[rax*8+r10] + cmp r15d,243 + jae NEAR $L$_16_blocks_overflow_728 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + 
vpaddd xmm5,xmm4,xmm27 + jmp NEAR $L$_16_blocks_ok_728 + +$L$_16_blocks_overflow_728: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpaddd zmm5,zmm4,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb xmm5,xmm5,xmm29 +$L$_16_blocks_ok_728: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm5,0 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vpxorq xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20,ZMMWORD[128+r11*1+r9] + vmovdqu8 xmm21{k1}{z},[192+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vaesenclast zmm0,zmm0,zmm30 + 
vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vaesenclast xmm5,xmm5,xmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vpxorq xmm5,xmm5,xmm21 + vextracti32x4 xmm11,zmm5,0 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm4 + vmovdqu8 XMMWORD[192+r11*1+r10]{k1},xmm5 + vmovdqu8 zmm21{k1}{z},zmm21 + vpshufb zmm17,zmm17,zmm29 + vpshufb zmm19,zmm19,zmm29 + vpshufb zmm20,zmm20,zmm29 + vpshufb xmm21,xmm21,xmm29 + vextracti32x4 xmm7,zmm21,0 + sub r13,16 * (13 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_729 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[144+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[208+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm21,xmm1,0x01 + vpclmulqdq xmm5,xmm21,xmm1,0x10 + vpclmulqdq xmm0,xmm21,xmm1,0x11 + vpclmulqdq xmm3,xmm21,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_729 +$L$_small_initial_partial_block_729: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[160+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[224+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + + vpxorq zmm30,zmm30,zmm26 + vpxorq zmm8,zmm8,zmm24 + vpxorq zmm22,zmm22,zmm25 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq 
xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_729: + + or r13,r13 + je NEAR $L$_after_reduction_729 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_729: + jmp NEAR $L$_last_blocks_done_703 +$L$_last_num_blocks_is_14_703: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,192 + kmovq k1,[rax*8+r10] + cmp r15d,242 + jae NEAR $L$_16_blocks_overflow_730 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + vpaddd ymm5,ymm4,ymm27 + jmp NEAR $L$_16_blocks_ok_730 + +$L$_16_blocks_overflow_730: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpaddd zmm5,zmm4,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb ymm5,ymm5,ymm29 +$L$_16_blocks_ok_730: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm5,1 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vpxorq ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20,ZMMWORD[128+r11*1+r9] + vmovdqu8 ymm21{k1}{z},[192+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + 
vaesenc ymm5,ymm5,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vaesenclast ymm5,ymm5,ymm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vpxorq ymm5,ymm5,ymm21 + vextracti32x4 xmm11,zmm5,1 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm4 + vmovdqu8 YMMWORD[192+r11*1+r10]{k1},ymm5 + vmovdqu8 zmm21{k1}{z},zmm21 + vpshufb zmm17,zmm17,zmm29 + vpshufb zmm19,zmm19,zmm29 + vpshufb zmm20,zmm20,zmm29 + vpshufb ymm21,ymm21,ymm29 + vextracti32x4 xmm7,zmm21,1 + sub r13,16 * (14 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_731 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[128+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[192+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm21,ymm1,0x01 + vpclmulqdq ymm5,ymm21,ymm1,0x10 + vpclmulqdq ymm0,ymm21,ymm1,0x11 + vpclmulqdq ymm3,ymm21,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_731 +$L$_small_initial_partial_block_731: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[144+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[208+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq 
zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm21,xmm1,0x01 + vpclmulqdq xmm5,xmm21,xmm1,0x10 + vpclmulqdq xmm0,xmm21,xmm1,0x11 + vpclmulqdq xmm3,xmm21,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_731: + + or r13,r13 + je NEAR $L$_after_reduction_731 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_731: + jmp NEAR $L$_last_blocks_done_703 +$L$_last_num_blocks_is_15_703: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,192 + kmovq k1,[rax*8+r10] + cmp r15d,241 + jae NEAR $L$_16_blocks_overflow_732 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + vpaddd zmm5,zmm4,zmm27 + jmp NEAR $L$_16_blocks_ok_732 + +$L$_16_blocks_overflow_732: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpaddd zmm5,zmm4,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb zmm5,zmm5,zmm29 +$L$_16_blocks_ok_732: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm5,2 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + 
vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20,ZMMWORD[128+r11*1+r9] + vmovdqu8 zmm21{k1}{z},[192+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vaesenclast zmm5,zmm5,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vpxorq zmm5,zmm5,zmm21 + vextracti32x4 xmm11,zmm5,2 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm4 + vmovdqu8 ZMMWORD[192+r11*1+r10]{k1},zmm5 + vmovdqu8 zmm21{k1}{z},zmm21 + vpshufb zmm17,zmm17,zmm29 + vpshufb zmm19,zmm19,zmm29 + vpshufb zmm20,zmm20,zmm29 + vpshufb zmm21,zmm21,zmm29 + vextracti32x4 xmm7,zmm21,2 + sub r13,16 * (15 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_733 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[112+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[176+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm21,zmm1,0x01 + vpclmulqdq zmm5,zmm21,zmm1,0x10 + vpclmulqdq zmm0,zmm21,zmm1,0x11 + vpclmulqdq zmm3,zmm21,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + 
+ + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_733 +$L$_small_initial_partial_block_733: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[128+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[192+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm21,ymm1,0x01 + vpclmulqdq ymm5,ymm21,ymm1,0x10 + vpclmulqdq ymm0,ymm21,ymm1,0x11 + vpclmulqdq ymm3,ymm21,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_733: + + or r13,r13 + je NEAR $L$_after_reduction_733 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_733: + jmp NEAR $L$_last_blocks_done_703 +$L$_last_num_blocks_is_16_703: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,192 + kmovq k1,[rax*8+r10] + cmp r15d,240 + jae NEAR $L$_16_blocks_overflow_734 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + vpaddd zmm5,zmm4,zmm27 + jmp NEAR $L$_16_blocks_ok_734 + +$L$_16_blocks_overflow_734: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpaddd zmm5,zmm4,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb zmm5,zmm5,zmm29 +$L$_16_blocks_ok_734: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm5,3 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 
zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20,ZMMWORD[128+r11*1+r9] + vmovdqu8 zmm21{k1}{z},[192+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vaesenclast zmm5,zmm5,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vpxorq zmm5,zmm5,zmm21 + vextracti32x4 xmm11,zmm5,3 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm4 + vmovdqu8 ZMMWORD[192+r11*1+r10]{k1},zmm5 + vmovdqu8 zmm21{k1}{z},zmm21 + vpshufb zmm17,zmm17,zmm29 + vpshufb zmm19,zmm19,zmm29 + vpshufb zmm20,zmm20,zmm29 + vpshufb zmm21,zmm21,zmm29 + vextracti32x4 xmm7,zmm21,3 + sub r13,16 * (16 - 1) +$L$_small_initial_partial_block_735: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[112+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[176+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq 
zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm21,zmm1,0x01 + vpclmulqdq zmm5,zmm21,zmm1,0x10 + vpclmulqdq zmm0,zmm21,zmm1,0x11 + vpclmulqdq zmm3,zmm21,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_735: + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_735: + jmp NEAR $L$_last_blocks_done_703 +$L$_last_num_blocks_is_0_703: + vmovdqa64 zmm13,ZMMWORD[768+rsp] + vpxorq zmm13,zmm13,zmm14 + vmovdqu64 zmm12,ZMMWORD[rbx*1+rsp] + vpclmulqdq zmm0,zmm13,zmm12,0x11 + vpclmulqdq zmm3,zmm13,zmm12,0x00 + vpclmulqdq zmm4,zmm13,zmm12,0x01 + vpclmulqdq zmm5,zmm13,zmm12,0x10 + vmovdqa64 zmm13,ZMMWORD[832+rsp] + vmovdqu64 zmm12,ZMMWORD[64+rbx*1+rsp] + vpclmulqdq zmm6,zmm13,zmm12,0x11 + vpclmulqdq zmm7,zmm13,zmm12,0x00 + vpclmulqdq zmm10,zmm13,zmm12,0x01 + vpclmulqdq zmm11,zmm13,zmm12,0x10 + vpxorq zmm26,zmm4,zmm10 + vpxorq zmm24,zmm0,zmm6 + vpxorq zmm25,zmm3,zmm7 + vpternlogq zmm26,zmm5,zmm11,0x96 + vmovdqa64 zmm13,ZMMWORD[896+rsp] + vmovdqu64 zmm12,ZMMWORD[128+rbx*1+rsp] + vpclmulqdq zmm0,zmm13,zmm12,0x11 + vpclmulqdq zmm3,zmm13,zmm12,0x00 + vpclmulqdq zmm4,zmm13,zmm12,0x01 + vpclmulqdq zmm5,zmm13,zmm12,0x10 + vmovdqa64 zmm13,ZMMWORD[960+rsp] + vmovdqu64 zmm12,ZMMWORD[192+rbx*1+rsp] + vpclmulqdq zmm6,zmm13,zmm12,0x11 + vpclmulqdq zmm7,zmm13,zmm12,0x00 + vpclmulqdq zmm10,zmm13,zmm12,0x01 + vpclmulqdq zmm11,zmm13,zmm12,0x10 + + vpternlogq zmm26,zmm4,zmm10,0x96 + vpternlogq zmm24,zmm0,zmm6,0x96 + vpternlogq zmm25,zmm3,zmm7,0x96 + vpternlogq zmm26,zmm5,zmm11,0x96 + + vpsrldq zmm0,zmm26,8 + vpslldq zmm3,zmm26,8 + vpxorq zmm24,zmm24,zmm0 + vpxorq zmm25,zmm25,zmm3 + vextracti64x4 ymm0,zmm24,1 + vpxorq ymm24,ymm24,ymm0 + vextracti32x4 xmm0,ymm24,1 + vpxorq xmm24,xmm24,xmm0 + vextracti64x4 ymm3,zmm25,1 + vpxorq ymm25,ymm25,ymm3 + vextracti32x4 xmm3,ymm25,1 + vpxorq xmm25,xmm25,xmm3 + vmovdqa64 xmm4,XMMWORD[POLY2] + + + vpclmulqdq xmm0,xmm4,xmm25,0x01 + vpslldq xmm0,xmm0,8 + vpxorq xmm0,xmm25,xmm0 + + + vpclmulqdq xmm3,xmm4,xmm0,0x00 + vpsrldq xmm3,xmm3,4 + vpclmulqdq xmm14,xmm4,xmm0,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm3,xmm24,0x96 + +$L$_last_blocks_done_703: + vpshufb xmm2,xmm2,xmm29 + jmp NEAR $L$_ghash_done_659 +$L$_encrypt_16_blocks_659: + cmp r15b,240 + jae NEAR $L$_16_blocks_overflow_736 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + vpaddd zmm5,zmm4,zmm27 + jmp NEAR $L$_16_blocks_ok_736 +$L$_16_blocks_overflow_736: + vpshufb zmm2,zmm2,zmm29 + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpaddd zmm5,zmm4,zmm5 + vpshufb 
zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb zmm5,zmm5,zmm29 +$L$_16_blocks_ok_736: + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rsp] + + + + + vshufi64x2 zmm2,zmm5,zmm5,255 + add r15b,16 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + + + + + + + + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + + vpclmulqdq zmm6,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + + + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + + + + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + + vpternlogq zmm6,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + + + + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + + + + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20,ZMMWORD[128+r11*1+r9] + vmovdqu8 zmm21,ZMMWORD[192+r11*1+r9] + + + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + + + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm26,zmm10,zmm15 + vpxorq zmm24,zmm6,zmm12 + vpxorq zmm25,zmm7,zmm13 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vaesenclast zmm5,zmm5,zmm30 + + + + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vpxorq zmm5,zmm5,zmm21 + + + + mov r10,QWORD[120+rbp] + vmovdqu8 
ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm4 + vmovdqu8 ZMMWORD[192+r11*1+r10],zmm5 + vpshufb zmm0,zmm17,zmm29 + vpshufb zmm3,zmm19,zmm29 + vpshufb zmm4,zmm20,zmm29 + vpshufb zmm5,zmm21,zmm29 + vmovdqa64 ZMMWORD[1280+rsp],zmm0 + vmovdqa64 ZMMWORD[1344+rsp],zmm3 + vmovdqa64 ZMMWORD[1408+rsp],zmm4 + vmovdqa64 ZMMWORD[1472+rsp],zmm5 + vmovdqa64 zmm13,ZMMWORD[1024+rsp] + vmovdqu64 zmm12,ZMMWORD[256+rsp] + vpclmulqdq zmm0,zmm13,zmm12,0x11 + vpclmulqdq zmm3,zmm13,zmm12,0x00 + vpclmulqdq zmm4,zmm13,zmm12,0x01 + vpclmulqdq zmm5,zmm13,zmm12,0x10 + vmovdqa64 zmm13,ZMMWORD[1088+rsp] + vmovdqu64 zmm12,ZMMWORD[320+rsp] + vpclmulqdq zmm6,zmm13,zmm12,0x11 + vpclmulqdq zmm7,zmm13,zmm12,0x00 + vpclmulqdq zmm10,zmm13,zmm12,0x01 + vpclmulqdq zmm11,zmm13,zmm12,0x10 + vpternlogq zmm26,zmm4,zmm10,0x96 + vpternlogq zmm24,zmm0,zmm6,0x96 + vpternlogq zmm25,zmm3,zmm7,0x96 + vpternlogq zmm26,zmm5,zmm11,0x96 + vmovdqa64 zmm13,ZMMWORD[1152+rsp] + vmovdqu64 zmm12,ZMMWORD[384+rsp] + vpclmulqdq zmm0,zmm13,zmm12,0x11 + vpclmulqdq zmm3,zmm13,zmm12,0x00 + vpclmulqdq zmm4,zmm13,zmm12,0x01 + vpclmulqdq zmm5,zmm13,zmm12,0x10 + vmovdqa64 zmm13,ZMMWORD[1216+rsp] + vmovdqu64 zmm12,ZMMWORD[448+rsp] + vpclmulqdq zmm6,zmm13,zmm12,0x11 + vpclmulqdq zmm7,zmm13,zmm12,0x00 + vpclmulqdq zmm10,zmm13,zmm12,0x01 + vpclmulqdq zmm11,zmm13,zmm12,0x10 + + vpternlogq zmm26,zmm4,zmm10,0x96 + vpternlogq zmm24,zmm0,zmm6,0x96 + vpternlogq zmm25,zmm3,zmm7,0x96 + vpternlogq zmm26,zmm5,zmm11,0x96 + sub r13,256 + add r11,256 + mov r10d,r13d + add r10d,15 + shr r10d,4 + je NEAR $L$_last_num_blocks_is_0_737 + + cmp r10d,8 + je NEAR $L$_last_num_blocks_is_8_737 + jb NEAR $L$_last_num_blocks_is_7_1_737 + + + cmp r10d,12 + je NEAR $L$_last_num_blocks_is_12_737 + jb NEAR $L$_last_num_blocks_is_11_9_737 + + + cmp r10d,15 + je NEAR $L$_last_num_blocks_is_15_737 + ja NEAR $L$_last_num_blocks_is_16_737 + cmp r10d,14 + je NEAR $L$_last_num_blocks_is_14_737 + jmp NEAR $L$_last_num_blocks_is_13_737 + +$L$_last_num_blocks_is_11_9_737: + + cmp r10d,10 + je NEAR $L$_last_num_blocks_is_10_737 + ja NEAR $L$_last_num_blocks_is_11_737 + jmp NEAR $L$_last_num_blocks_is_9_737 + +$L$_last_num_blocks_is_7_1_737: + cmp r10d,4 + je NEAR $L$_last_num_blocks_is_4_737 + jb NEAR $L$_last_num_blocks_is_3_1_737 + + cmp r10d,6 + ja NEAR $L$_last_num_blocks_is_7_737 + je NEAR $L$_last_num_blocks_is_6_737 + jmp NEAR $L$_last_num_blocks_is_5_737 + +$L$_last_num_blocks_is_3_1_737: + + cmp r10d,2 + ja NEAR $L$_last_num_blocks_is_3_737 + je NEAR $L$_last_num_blocks_is_2_737 +$L$_last_num_blocks_is_1_737: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + kmovq k1,[rax*8+r10] + cmp r15d,255 + jae NEAR $L$_16_blocks_overflow_738 + vpaddd xmm0,xmm2,xmm28 + jmp NEAR $L$_16_blocks_ok_738 + +$L$_16_blocks_overflow_738: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpshufb xmm0,xmm0,xmm29 +$L$_16_blocks_ok_738: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1280+rsp] + vmovdqu64 zmm1,ZMMWORD[512+rsp] + vextracti32x4 xmm2,zmm0,0 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[576+rsp] + vmovdqa64 zmm22,ZMMWORD[1344+rsp] + vpxorq xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[640+rsp] + vmovdqa64 zmm8,ZMMWORD[1408+rsp] + vaesenc xmm0,xmm0,xmm31 + vbroadcastf64x2 
zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[704+rsp] + vmovdqa64 zmm22,ZMMWORD[1472+rsp] + vaesenc xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc xmm0,xmm0,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 xmm17{k1}{z},[r11*1+r9] + vaesenc xmm0,xmm0,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm14,zmm24,zmm12,0x96 + vpternlogq zmm7,zmm25,zmm13,0x96 + vpternlogq zmm10,zmm26,zmm15,0x96 + vaesenc xmm0,xmm0,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vpsrldq zmm15,zmm10,8 + vpslldq zmm10,zmm10,8 + + vmovdqa64 xmm16,XMMWORD[POLY2] + vaesenc xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vpxorq zmm14,zmm14,zmm15 + vpxorq zmm7,zmm7,zmm10 + vaesenc xmm0,xmm0,xmm31 + vextracti64x4 ymm12,zmm14,1 + vpxorq ymm14,ymm14,ymm12 + vextracti32x4 xmm12,ymm14,1 + vpxorq xmm14,xmm14,xmm12 + vextracti64x4 ymm13,zmm7,1 + vpxorq ymm7,ymm7,ymm13 + vextracti32x4 xmm13,ymm7,1 + vpxorq xmm7,xmm7,xmm13 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vpclmulqdq xmm13,xmm16,xmm7,0x01 + vpslldq xmm13,xmm13,8 + vpxorq xmm13,xmm7,xmm13 + vaesenc xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc xmm0,xmm0,xmm31 + vpclmulqdq xmm12,xmm16,xmm13,0x00 + vpsrldq xmm12,xmm12,4 + vpclmulqdq xmm15,xmm16,xmm13,0x10 + vpslldq xmm15,xmm15,4 + + vpternlogq xmm14,xmm15,xmm12,0x96 + vaesenclast xmm0,xmm0,xmm30 + vpxorq xmm0,xmm0,xmm17 + vextracti32x4 xmm11,zmm0,0 + mov r10,QWORD[120+rbp] + vmovdqu8 XMMWORD[r11*1+r10]{k1},xmm0 + vmovdqu8 zmm17{k1}{z},zmm17 + vpshufb xmm17,xmm17,xmm29 + vextracti32x4 xmm7,zmm17,0 + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_739 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm17,xmm1,0x01 + vpclmulqdq xmm5,xmm17,xmm1,0x10 + vpclmulqdq xmm0,xmm17,xmm1,0x11 + vpclmulqdq xmm3,xmm17,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_739 +$L$_small_initial_partial_block_739: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + + + + + + + + + + + + vpxorq xmm14,xmm14,xmm7 + + jmp NEAR $L$_after_reduction_739 +$L$_small_initial_compute_done_739: +$L$_after_reduction_739: + jmp NEAR 
$L$_last_blocks_done_737 +$L$_last_num_blocks_is_2_737: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + kmovq k1,[rax*8+r10] + cmp r15d,254 + jae NEAR $L$_16_blocks_overflow_740 + vpaddd ymm0,ymm2,ymm28 + jmp NEAR $L$_16_blocks_ok_740 + +$L$_16_blocks_overflow_740: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpshufb ymm0,ymm0,ymm29 +$L$_16_blocks_ok_740: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1280+rsp] + vmovdqu64 zmm1,ZMMWORD[512+rsp] + vextracti32x4 xmm2,zmm0,1 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[576+rsp] + vmovdqa64 zmm22,ZMMWORD[1344+rsp] + vpxorq ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[640+rsp] + vmovdqa64 zmm8,ZMMWORD[1408+rsp] + vaesenc ymm0,ymm0,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[704+rsp] + vmovdqa64 zmm22,ZMMWORD[1472+rsp] + vaesenc ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc ymm0,ymm0,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 ymm17{k1}{z},[r11*1+r9] + vaesenc ymm0,ymm0,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm14,zmm24,zmm12,0x96 + vpternlogq zmm7,zmm25,zmm13,0x96 + vpternlogq zmm10,zmm26,zmm15,0x96 + vaesenc ymm0,ymm0,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vpsrldq zmm15,zmm10,8 + vpslldq zmm10,zmm10,8 + + vmovdqa64 xmm16,XMMWORD[POLY2] + vaesenc ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vpxorq zmm14,zmm14,zmm15 + vpxorq zmm7,zmm7,zmm10 + vaesenc ymm0,ymm0,ymm31 + vextracti64x4 ymm12,zmm14,1 + vpxorq ymm14,ymm14,ymm12 + vextracti32x4 xmm12,ymm14,1 + vpxorq xmm14,xmm14,xmm12 + vextracti64x4 ymm13,zmm7,1 + vpxorq ymm7,ymm7,ymm13 + vextracti32x4 xmm13,ymm7,1 + vpxorq xmm7,xmm7,xmm13 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vpclmulqdq xmm13,xmm16,xmm7,0x01 + vpslldq xmm13,xmm13,8 + vpxorq xmm13,xmm7,xmm13 + vaesenc ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc ymm0,ymm0,ymm31 + vpclmulqdq xmm12,xmm16,xmm13,0x00 + vpsrldq xmm12,xmm12,4 + vpclmulqdq xmm15,xmm16,xmm13,0x10 + vpslldq xmm15,xmm15,4 + + vpternlogq xmm14,xmm15,xmm12,0x96 + vaesenclast ymm0,ymm0,ymm30 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm11,zmm0,1 + mov r10,QWORD[120+rbp] + vmovdqu8 YMMWORD[r11*1+r10]{k1},ymm0 + vmovdqu8 zmm17{k1}{z},zmm17 + vpshufb ymm17,ymm17,ymm29 + vextracti32x4 xmm7,zmm17,1 + sub r13,16 * (2 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_741 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm17,ymm1,0x01 + vpclmulqdq 
ymm5,ymm17,ymm1,0x10 + vpclmulqdq ymm0,ymm17,ymm1,0x11 + vpclmulqdq ymm3,ymm17,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_741 +$L$_small_initial_partial_block_741: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm17,xmm1,0x01 + vpclmulqdq xmm5,xmm17,xmm1,0x10 + vpclmulqdq xmm0,xmm17,xmm1,0x11 + vpclmulqdq xmm3,xmm17,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_741: + + or r13,r13 + je NEAR $L$_after_reduction_741 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_741: + jmp NEAR $L$_last_blocks_done_737 +$L$_last_num_blocks_is_3_737: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + kmovq k1,[rax*8+r10] + cmp r15d,253 + jae NEAR $L$_16_blocks_overflow_742 + vpaddd zmm0,zmm2,zmm28 + jmp NEAR $L$_16_blocks_ok_742 + +$L$_16_blocks_overflow_742: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpshufb zmm0,zmm0,zmm29 +$L$_16_blocks_ok_742: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1280+rsp] + vmovdqu64 zmm1,ZMMWORD[512+rsp] + vextracti32x4 xmm2,zmm0,2 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[576+rsp] + vmovdqa64 zmm22,ZMMWORD[1344+rsp] + vpxorq zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[640+rsp] + vmovdqa64 zmm8,ZMMWORD[1408+rsp] + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[704+rsp] + vmovdqa64 zmm22,ZMMWORD[1472+rsp] + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17{k1}{z},[r11*1+r9] + 
vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm14,zmm24,zmm12,0x96 + vpternlogq zmm7,zmm25,zmm13,0x96 + vpternlogq zmm10,zmm26,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vpsrldq zmm15,zmm10,8 + vpslldq zmm10,zmm10,8 + + vmovdqa64 xmm16,XMMWORD[POLY2] + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vpxorq zmm14,zmm14,zmm15 + vpxorq zmm7,zmm7,zmm10 + vaesenc zmm0,zmm0,zmm31 + vextracti64x4 ymm12,zmm14,1 + vpxorq ymm14,ymm14,ymm12 + vextracti32x4 xmm12,ymm14,1 + vpxorq xmm14,xmm14,xmm12 + vextracti64x4 ymm13,zmm7,1 + vpxorq ymm7,ymm7,ymm13 + vextracti32x4 xmm13,ymm7,1 + vpxorq xmm7,xmm7,xmm13 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vpclmulqdq xmm13,xmm16,xmm7,0x01 + vpslldq xmm13,xmm13,8 + vpxorq xmm13,xmm7,xmm13 + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vpclmulqdq xmm12,xmm16,xmm13,0x00 + vpsrldq xmm12,xmm12,4 + vpclmulqdq xmm15,xmm16,xmm13,0x10 + vpslldq xmm15,xmm15,4 + + vpternlogq xmm14,xmm15,xmm12,0x96 + vaesenclast zmm0,zmm0,zmm30 + vpxorq zmm0,zmm0,zmm17 + vextracti32x4 xmm11,zmm0,2 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10]{k1},zmm0 + vmovdqu8 zmm17{k1}{z},zmm17 + vpshufb zmm17,zmm17,zmm29 + vextracti32x4 xmm7,zmm17,2 + sub r13,16 * (3 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_743 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_743 +$L$_small_initial_partial_block_743: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm17,ymm1,0x01 + vpclmulqdq ymm5,ymm17,ymm1,0x10 + vpclmulqdq ymm0,ymm17,ymm1,0x11 + vpclmulqdq ymm3,ymm17,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_743: + + or r13,r13 + je NEAR 
$L$_after_reduction_743 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_743: + jmp NEAR $L$_last_blocks_done_737 +$L$_last_num_blocks_is_4_737: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + kmovq k1,[rax*8+r10] + cmp r15d,252 + jae NEAR $L$_16_blocks_overflow_744 + vpaddd zmm0,zmm2,zmm28 + jmp NEAR $L$_16_blocks_ok_744 + +$L$_16_blocks_overflow_744: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpshufb zmm0,zmm0,zmm29 +$L$_16_blocks_ok_744: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1280+rsp] + vmovdqu64 zmm1,ZMMWORD[512+rsp] + vextracti32x4 xmm2,zmm0,3 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[576+rsp] + vmovdqa64 zmm22,ZMMWORD[1344+rsp] + vpxorq zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[640+rsp] + vmovdqa64 zmm8,ZMMWORD[1408+rsp] + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[704+rsp] + vmovdqa64 zmm22,ZMMWORD[1472+rsp] + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17{k1}{z},[r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm14,zmm24,zmm12,0x96 + vpternlogq zmm7,zmm25,zmm13,0x96 + vpternlogq zmm10,zmm26,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vpsrldq zmm15,zmm10,8 + vpslldq zmm10,zmm10,8 + + vmovdqa64 xmm16,XMMWORD[POLY2] + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vpxorq zmm14,zmm14,zmm15 + vpxorq zmm7,zmm7,zmm10 + vaesenc zmm0,zmm0,zmm31 + vextracti64x4 ymm12,zmm14,1 + vpxorq ymm14,ymm14,ymm12 + vextracti32x4 xmm12,ymm14,1 + vpxorq xmm14,xmm14,xmm12 + vextracti64x4 ymm13,zmm7,1 + vpxorq ymm7,ymm7,ymm13 + vextracti32x4 xmm13,ymm7,1 + vpxorq xmm7,xmm7,xmm13 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vpclmulqdq xmm13,xmm16,xmm7,0x01 + vpslldq xmm13,xmm13,8 + vpxorq xmm13,xmm7,xmm13 + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vpclmulqdq xmm12,xmm16,xmm13,0x00 + vpsrldq xmm12,xmm12,4 + vpclmulqdq xmm15,xmm16,xmm13,0x10 + vpslldq xmm15,xmm15,4 + + vpternlogq xmm14,xmm15,xmm12,0x96 + vaesenclast zmm0,zmm0,zmm30 + vpxorq zmm0,zmm0,zmm17 + vextracti32x4 xmm11,zmm0,3 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10]{k1},zmm0 + vmovdqu8 zmm17{k1}{z},zmm17 + vpshufb zmm17,zmm17,zmm29 + vextracti32x4 xmm7,zmm17,3 + sub r13,16 * (4 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_745 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm17,zmm17,zmm14 + 
vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_745 +$L$_small_initial_partial_block_745: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_745: + + or r13,r13 + je NEAR $L$_after_reduction_745 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_745: + jmp NEAR $L$_last_blocks_done_737 +$L$_last_num_blocks_is_5_737: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,64 + kmovq k1,[rax*8+r10] + cmp r15d,251 + jae NEAR $L$_16_blocks_overflow_746 + vpaddd zmm0,zmm2,zmm28 + vpaddd xmm3,xmm0,xmm27 + jmp NEAR $L$_16_blocks_ok_746 + +$L$_16_blocks_overflow_746: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb xmm3,xmm3,xmm29 +$L$_16_blocks_ok_746: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1280+rsp] + vmovdqu64 zmm1,ZMMWORD[512+rsp] + vextracti32x4 xmm2,zmm3,0 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[576+rsp] + vmovdqa64 zmm22,ZMMWORD[1344+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[640+rsp] + vmovdqa64 zmm8,ZMMWORD[1408+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[704+rsp] + vmovdqa64 zmm22,ZMMWORD[1472+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq 
zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 xmm19{k1}{z},[64+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm14,zmm24,zmm12,0x96 + vpternlogq zmm7,zmm25,zmm13,0x96 + vpternlogq zmm10,zmm26,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vpsrldq zmm15,zmm10,8 + vpslldq zmm10,zmm10,8 + + vmovdqa64 xmm16,XMMWORD[POLY2] + vaesenc zmm0,zmm0,zmm30 + vaesenc xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vpxorq zmm14,zmm14,zmm15 + vpxorq zmm7,zmm7,zmm10 + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vextracti64x4 ymm12,zmm14,1 + vpxorq ymm14,ymm14,ymm12 + vextracti32x4 xmm12,ymm14,1 + vpxorq xmm14,xmm14,xmm12 + vextracti64x4 ymm13,zmm7,1 + vpxorq ymm7,ymm7,ymm13 + vextracti32x4 xmm13,ymm7,1 + vpxorq xmm7,xmm7,xmm13 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vpclmulqdq xmm13,xmm16,xmm7,0x01 + vpslldq xmm13,xmm13,8 + vpxorq xmm13,xmm7,xmm13 + vaesenc zmm0,zmm0,zmm30 + vaesenc xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vpclmulqdq xmm12,xmm16,xmm13,0x00 + vpsrldq xmm12,xmm12,4 + vpclmulqdq xmm15,xmm16,xmm13,0x10 + vpslldq xmm15,xmm15,4 + + vpternlogq xmm14,xmm15,xmm12,0x96 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast xmm3,xmm3,xmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq xmm3,xmm3,xmm19 + vextracti32x4 xmm11,zmm3,0 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 XMMWORD[64+r11*1+r10]{k1},xmm3 + vmovdqu8 zmm19{k1}{z},zmm19 + vpshufb zmm17,zmm17,zmm29 + vpshufb xmm19,xmm19,xmm29 + vextracti32x4 xmm7,zmm19,0 + sub r13,16 * (5 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_747 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm19,xmm1,0x01 + vpclmulqdq xmm5,xmm19,xmm1,0x10 + vpclmulqdq xmm0,xmm19,xmm1,0x11 + vpclmulqdq xmm3,xmm19,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm31 + vpxorq zmm0,zmm0,zmm8 + vpxorq zmm3,zmm3,zmm22 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_747 
+$L$_small_initial_partial_block_747: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_747: + + or r13,r13 + je NEAR $L$_after_reduction_747 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_747: + jmp NEAR $L$_last_blocks_done_737 +$L$_last_num_blocks_is_6_737: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,64 + kmovq k1,[rax*8+r10] + cmp r15d,250 + jae NEAR $L$_16_blocks_overflow_748 + vpaddd zmm0,zmm2,zmm28 + vpaddd ymm3,ymm0,ymm27 + jmp NEAR $L$_16_blocks_ok_748 + +$L$_16_blocks_overflow_748: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb ymm3,ymm3,ymm29 +$L$_16_blocks_ok_748: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1280+rsp] + vmovdqu64 zmm1,ZMMWORD[512+rsp] + vextracti32x4 xmm2,zmm3,1 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[576+rsp] + vmovdqa64 zmm22,ZMMWORD[1344+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[640+rsp] + vmovdqa64 zmm8,ZMMWORD[1408+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[704+rsp] + vmovdqa64 zmm22,ZMMWORD[1472+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 ymm19{k1}{z},[64+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm14,zmm24,zmm12,0x96 + vpternlogq zmm7,zmm25,zmm13,0x96 + 
vpternlogq zmm10,zmm26,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vpsrldq zmm15,zmm10,8 + vpslldq zmm10,zmm10,8 + + vmovdqa64 xmm16,XMMWORD[POLY2] + vaesenc zmm0,zmm0,zmm30 + vaesenc ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vpxorq zmm14,zmm14,zmm15 + vpxorq zmm7,zmm7,zmm10 + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vextracti64x4 ymm12,zmm14,1 + vpxorq ymm14,ymm14,ymm12 + vextracti32x4 xmm12,ymm14,1 + vpxorq xmm14,xmm14,xmm12 + vextracti64x4 ymm13,zmm7,1 + vpxorq ymm7,ymm7,ymm13 + vextracti32x4 xmm13,ymm7,1 + vpxorq xmm7,xmm7,xmm13 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vpclmulqdq xmm13,xmm16,xmm7,0x01 + vpslldq xmm13,xmm13,8 + vpxorq xmm13,xmm7,xmm13 + vaesenc zmm0,zmm0,zmm30 + vaesenc ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vpclmulqdq xmm12,xmm16,xmm13,0x00 + vpsrldq xmm12,xmm12,4 + vpclmulqdq xmm15,xmm16,xmm13,0x10 + vpslldq xmm15,xmm15,4 + + vpternlogq xmm14,xmm15,xmm12,0x96 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast ymm3,ymm3,ymm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm11,zmm3,1 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 YMMWORD[64+r11*1+r10]{k1},ymm3 + vmovdqu8 zmm19{k1}{z},zmm19 + vpshufb zmm17,zmm17,zmm29 + vpshufb ymm19,ymm19,ymm29 + vextracti32x4 xmm7,zmm19,1 + sub r13,16 * (6 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_749 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm19,ymm1,0x01 + vpclmulqdq ymm5,ymm19,ymm1,0x10 + vpclmulqdq ymm0,ymm19,ymm1,0x11 + vpclmulqdq ymm3,ymm19,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm31 + vpxorq zmm0,zmm0,zmm8 + vpxorq zmm3,zmm3,zmm22 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_749 +$L$_small_initial_partial_block_749: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm19,xmm1,0x01 + vpclmulqdq xmm5,xmm19,xmm1,0x10 + vpclmulqdq xmm0,xmm19,xmm1,0x11 + vpclmulqdq xmm3,xmm19,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm31 + vpxorq zmm0,zmm0,zmm8 + vpxorq zmm3,zmm3,zmm22 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 
xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_749: + + or r13,r13 + je NEAR $L$_after_reduction_749 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_749: + jmp NEAR $L$_last_blocks_done_737 +$L$_last_num_blocks_is_7_737: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,64 + kmovq k1,[rax*8+r10] + cmp r15d,249 + jae NEAR $L$_16_blocks_overflow_750 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + jmp NEAR $L$_16_blocks_ok_750 + +$L$_16_blocks_overflow_750: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 +$L$_16_blocks_ok_750: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1280+rsp] + vmovdqu64 zmm1,ZMMWORD[512+rsp] + vextracti32x4 xmm2,zmm3,2 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[576+rsp] + vmovdqa64 zmm22,ZMMWORD[1344+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[640+rsp] + vmovdqa64 zmm8,ZMMWORD[1408+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[704+rsp] + vmovdqa64 zmm22,ZMMWORD[1472+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19{k1}{z},[64+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm14,zmm24,zmm12,0x96 + vpternlogq zmm7,zmm25,zmm13,0x96 + vpternlogq zmm10,zmm26,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vpsrldq zmm15,zmm10,8 + vpslldq zmm10,zmm10,8 + + vmovdqa64 xmm16,XMMWORD[POLY2] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vpxorq zmm14,zmm14,zmm15 + vpxorq zmm7,zmm7,zmm10 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vextracti64x4 ymm12,zmm14,1 + vpxorq ymm14,ymm14,ymm12 + vextracti32x4 xmm12,ymm14,1 + vpxorq xmm14,xmm14,xmm12 + vextracti64x4 ymm13,zmm7,1 + vpxorq ymm7,ymm7,ymm13 + vextracti32x4 xmm13,ymm7,1 + 
vpxorq xmm7,xmm7,xmm13 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vpclmulqdq xmm13,xmm16,xmm7,0x01 + vpslldq xmm13,xmm13,8 + vpxorq xmm13,xmm7,xmm13 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vpclmulqdq xmm12,xmm16,xmm13,0x00 + vpsrldq xmm12,xmm12,4 + vpclmulqdq xmm15,xmm16,xmm13,0x10 + vpslldq xmm15,xmm15,4 + + vpternlogq xmm14,xmm15,xmm12,0x96 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti32x4 xmm11,zmm3,2 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10]{k1},zmm3 + vmovdqu8 zmm19{k1}{z},zmm19 + vpshufb zmm17,zmm17,zmm29 + vpshufb zmm19,zmm19,zmm29 + vextracti32x4 xmm7,zmm19,2 + sub r13,16 * (7 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_751 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm19,zmm1,0x01 + vpclmulqdq zmm5,zmm19,zmm1,0x10 + vpclmulqdq zmm0,zmm19,zmm1,0x11 + vpclmulqdq zmm3,zmm19,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm31 + vpxorq zmm0,zmm0,zmm8 + vpxorq zmm3,zmm3,zmm22 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_751 +$L$_small_initial_partial_block_751: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm19,ymm1,0x01 + vpclmulqdq ymm5,ymm19,ymm1,0x10 + vpclmulqdq ymm0,ymm19,ymm1,0x11 + vpclmulqdq ymm3,ymm19,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm31 + vpxorq zmm0,zmm0,zmm8 + vpxorq zmm3,zmm3,zmm22 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_751: + + or r13,r13 + je NEAR $L$_after_reduction_751 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_751: + jmp NEAR $L$_last_blocks_done_737 +$L$_last_num_blocks_is_8_737: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub 
rax,64 + kmovq k1,[rax*8+r10] + cmp r15d,248 + jae NEAR $L$_16_blocks_overflow_752 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + jmp NEAR $L$_16_blocks_ok_752 + +$L$_16_blocks_overflow_752: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 +$L$_16_blocks_ok_752: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1280+rsp] + vmovdqu64 zmm1,ZMMWORD[512+rsp] + vextracti32x4 xmm2,zmm3,3 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[576+rsp] + vmovdqa64 zmm22,ZMMWORD[1344+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[640+rsp] + vmovdqa64 zmm8,ZMMWORD[1408+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[704+rsp] + vmovdqa64 zmm22,ZMMWORD[1472+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19{k1}{z},[64+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm14,zmm24,zmm12,0x96 + vpternlogq zmm7,zmm25,zmm13,0x96 + vpternlogq zmm10,zmm26,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vpsrldq zmm15,zmm10,8 + vpslldq zmm10,zmm10,8 + + vmovdqa64 xmm16,XMMWORD[POLY2] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vpxorq zmm14,zmm14,zmm15 + vpxorq zmm7,zmm7,zmm10 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vextracti64x4 ymm12,zmm14,1 + vpxorq ymm14,ymm14,ymm12 + vextracti32x4 xmm12,ymm14,1 + vpxorq xmm14,xmm14,xmm12 + vextracti64x4 ymm13,zmm7,1 + vpxorq ymm7,ymm7,ymm13 + vextracti32x4 xmm13,ymm7,1 + vpxorq xmm7,xmm7,xmm13 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vpclmulqdq xmm13,xmm16,xmm7,0x01 + vpslldq xmm13,xmm13,8 + vpxorq xmm13,xmm7,xmm13 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vpclmulqdq xmm12,xmm16,xmm13,0x00 + vpsrldq xmm12,xmm12,4 + vpclmulqdq xmm15,xmm16,xmm13,0x10 + vpslldq xmm15,xmm15,4 + + vpternlogq xmm14,xmm15,xmm12,0x96 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + 
vextracti32x4 xmm11,zmm3,3 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10]{k1},zmm3 + vmovdqu8 zmm19{k1}{z},zmm19 + vpshufb zmm17,zmm17,zmm29 + vpshufb zmm19,zmm19,zmm29 + vextracti32x4 xmm7,zmm19,3 + sub r13,16 * (8 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_753 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[224+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_753 +$L$_small_initial_partial_block_753: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm19,zmm1,0x01 + vpclmulqdq zmm5,zmm19,zmm1,0x10 + vpclmulqdq zmm0,zmm19,zmm1,0x11 + vpclmulqdq zmm3,zmm19,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm31 + vpxorq zmm0,zmm0,zmm8 + vpxorq zmm3,zmm3,zmm22 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_753: + + or r13,r13 + je NEAR $L$_after_reduction_753 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_753: + jmp NEAR $L$_last_blocks_done_737 +$L$_last_num_blocks_is_9_737: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,128 + kmovq k1,[rax*8+r10] + cmp r15d,247 + jae NEAR $L$_16_blocks_overflow_754 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd xmm4,xmm3,xmm27 + jmp NEAR $L$_16_blocks_ok_754 + +$L$_16_blocks_overflow_754: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb xmm4,xmm4,xmm29 +$L$_16_blocks_ok_754: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1280+rsp] + 
vmovdqu64 zmm1,ZMMWORD[512+rsp] + vextracti32x4 xmm2,zmm4,0 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[576+rsp] + vmovdqa64 zmm22,ZMMWORD[1344+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[640+rsp] + vmovdqa64 zmm8,ZMMWORD[1408+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[704+rsp] + vmovdqa64 zmm22,ZMMWORD[1472+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 xmm20{k1}{z},[128+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm14,zmm24,zmm12,0x96 + vpternlogq zmm7,zmm25,zmm13,0x96 + vpternlogq zmm10,zmm26,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vpsrldq zmm15,zmm10,8 + vpslldq zmm10,zmm10,8 + + vmovdqa64 xmm16,XMMWORD[POLY2] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vpxorq zmm14,zmm14,zmm15 + vpxorq zmm7,zmm7,zmm10 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vextracti64x4 ymm12,zmm14,1 + vpxorq ymm14,ymm14,ymm12 + vextracti32x4 xmm12,ymm14,1 + vpxorq xmm14,xmm14,xmm12 + vextracti64x4 ymm13,zmm7,1 + vpxorq ymm7,ymm7,ymm13 + vextracti32x4 xmm13,ymm7,1 + vpxorq xmm7,xmm7,xmm13 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vpclmulqdq xmm13,xmm16,xmm7,0x01 + vpslldq xmm13,xmm13,8 + vpxorq xmm13,xmm7,xmm13 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vpclmulqdq xmm12,xmm16,xmm13,0x00 + vpsrldq xmm12,xmm12,4 + vpclmulqdq xmm15,xmm16,xmm13,0x10 + vpslldq xmm15,xmm15,4 + + vpternlogq xmm14,xmm15,xmm12,0x96 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast xmm4,xmm4,xmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq xmm4,xmm4,xmm20 + vextracti32x4 xmm11,zmm4,0 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 
+ vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 XMMWORD[128+r11*1+r10]{k1},xmm4 + vmovdqu8 zmm20{k1}{z},zmm20 + vpshufb zmm17,zmm17,zmm29 + vpshufb zmm19,zmm19,zmm29 + vpshufb xmm20,xmm20,xmm29 + vextracti32x4 xmm7,zmm20,0 + sub r13,16 * (9 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_755 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[208+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm20,xmm1,0x01 + vpclmulqdq xmm5,xmm20,xmm1,0x10 + vpclmulqdq xmm0,xmm20,xmm1,0x11 + vpclmulqdq xmm3,xmm20,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm31 + vpxorq zmm0,zmm0,zmm8 + vpxorq zmm3,zmm3,zmm22 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_755 +$L$_small_initial_partial_block_755: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[224+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_755: + + or r13,r13 + je NEAR $L$_after_reduction_755 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_755: + jmp NEAR $L$_last_blocks_done_737 +$L$_last_num_blocks_is_10_737: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,128 + kmovq k1,[rax*8+r10] + cmp r15d,246 + jae NEAR $L$_16_blocks_overflow_756 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd ymm4,ymm3,ymm27 + jmp NEAR $L$_16_blocks_ok_756 + +$L$_16_blocks_overflow_756: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + 
vpaddd zmm4,zmm3,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb ymm4,ymm4,ymm29 +$L$_16_blocks_ok_756: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1280+rsp] + vmovdqu64 zmm1,ZMMWORD[512+rsp] + vextracti32x4 xmm2,zmm4,1 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[576+rsp] + vmovdqa64 zmm22,ZMMWORD[1344+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[640+rsp] + vmovdqa64 zmm8,ZMMWORD[1408+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[704+rsp] + vmovdqa64 zmm22,ZMMWORD[1472+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 ymm20{k1}{z},[128+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm14,zmm24,zmm12,0x96 + vpternlogq zmm7,zmm25,zmm13,0x96 + vpternlogq zmm10,zmm26,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vpsrldq zmm15,zmm10,8 + vpslldq zmm10,zmm10,8 + + vmovdqa64 xmm16,XMMWORD[POLY2] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vpxorq zmm14,zmm14,zmm15 + vpxorq zmm7,zmm7,zmm10 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vextracti64x4 ymm12,zmm14,1 + vpxorq ymm14,ymm14,ymm12 + vextracti32x4 xmm12,ymm14,1 + vpxorq xmm14,xmm14,xmm12 + vextracti64x4 ymm13,zmm7,1 + vpxorq ymm7,ymm7,ymm13 + vextracti32x4 xmm13,ymm7,1 + vpxorq xmm7,xmm7,xmm13 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vpclmulqdq xmm13,xmm16,xmm7,0x01 + vpslldq xmm13,xmm13,8 + vpxorq xmm13,xmm7,xmm13 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vpclmulqdq xmm12,xmm16,xmm13,0x00 + vpsrldq xmm12,xmm12,4 + vpclmulqdq xmm15,xmm16,xmm13,0x10 + vpslldq xmm15,xmm15,4 + + vpternlogq xmm14,xmm15,xmm12,0x96 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast 
zmm3,zmm3,zmm30 + vaesenclast ymm4,ymm4,ymm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq ymm4,ymm4,ymm20 + vextracti32x4 xmm11,zmm4,1 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 YMMWORD[128+r11*1+r10]{k1},ymm4 + vmovdqu8 zmm20{k1}{z},zmm20 + vpshufb zmm17,zmm17,zmm29 + vpshufb zmm19,zmm19,zmm29 + vpshufb ymm20,ymm20,ymm29 + vextracti32x4 xmm7,zmm20,1 + sub r13,16 * (10 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_757 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[192+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm20,ymm1,0x01 + vpclmulqdq ymm5,ymm20,ymm1,0x10 + vpclmulqdq ymm0,ymm20,ymm1,0x11 + vpclmulqdq ymm3,ymm20,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm31 + vpxorq zmm0,zmm0,zmm8 + vpxorq zmm3,zmm3,zmm22 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_757 +$L$_small_initial_partial_block_757: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[208+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm20,xmm1,0x01 + vpclmulqdq xmm5,xmm20,xmm1,0x10 + vpclmulqdq xmm0,xmm20,xmm1,0x11 + vpclmulqdq xmm3,xmm20,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm31 + vpxorq zmm0,zmm0,zmm8 + vpxorq zmm3,zmm3,zmm22 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_757: + + or r13,r13 + je NEAR $L$_after_reduction_757 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_757: + 
jmp NEAR $L$_last_blocks_done_737 +$L$_last_num_blocks_is_11_737: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,128 + kmovq k1,[rax*8+r10] + cmp r15d,245 + jae NEAR $L$_16_blocks_overflow_758 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + jmp NEAR $L$_16_blocks_ok_758 + +$L$_16_blocks_overflow_758: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 +$L$_16_blocks_ok_758: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1280+rsp] + vmovdqu64 zmm1,ZMMWORD[512+rsp] + vextracti32x4 xmm2,zmm4,2 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[576+rsp] + vmovdqa64 zmm22,ZMMWORD[1344+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[640+rsp] + vmovdqa64 zmm8,ZMMWORD[1408+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[704+rsp] + vmovdqa64 zmm22,ZMMWORD[1472+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20{k1}{z},[128+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm14,zmm24,zmm12,0x96 + vpternlogq zmm7,zmm25,zmm13,0x96 + vpternlogq zmm10,zmm26,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vpsrldq zmm15,zmm10,8 + vpslldq zmm10,zmm10,8 + + vmovdqa64 xmm16,XMMWORD[POLY2] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vpxorq zmm14,zmm14,zmm15 + vpxorq zmm7,zmm7,zmm10 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vextracti64x4 ymm12,zmm14,1 + vpxorq ymm14,ymm14,ymm12 + vextracti32x4 xmm12,ymm14,1 + vpxorq xmm14,xmm14,xmm12 + vextracti64x4 ymm13,zmm7,1 + vpxorq ymm7,ymm7,ymm13 + vextracti32x4 xmm13,ymm7,1 + vpxorq xmm7,xmm7,xmm13 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + 
vpclmulqdq xmm13,xmm16,xmm7,0x01 + vpslldq xmm13,xmm13,8 + vpxorq xmm13,xmm7,xmm13 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vpclmulqdq xmm12,xmm16,xmm13,0x00 + vpsrldq xmm12,xmm12,4 + vpclmulqdq xmm15,xmm16,xmm13,0x10 + vpslldq xmm15,xmm15,4 + + vpternlogq xmm14,xmm15,xmm12,0x96 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vextracti32x4 xmm11,zmm4,2 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10]{k1},zmm4 + vmovdqu8 zmm20{k1}{z},zmm20 + vpshufb zmm17,zmm17,zmm29 + vpshufb zmm19,zmm19,zmm29 + vpshufb zmm20,zmm20,zmm29 + vextracti32x4 xmm7,zmm20,2 + sub r13,16 * (11 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_759 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[176+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm20,zmm1,0x01 + vpclmulqdq zmm5,zmm20,zmm1,0x10 + vpclmulqdq zmm0,zmm20,zmm1,0x11 + vpclmulqdq zmm3,zmm20,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm31 + vpxorq zmm0,zmm0,zmm8 + vpxorq zmm3,zmm3,zmm22 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_759 +$L$_small_initial_partial_block_759: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[192+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm20,ymm1,0x01 + vpclmulqdq ymm5,ymm20,ymm1,0x10 + vpclmulqdq ymm0,ymm20,ymm1,0x11 + vpclmulqdq ymm3,ymm20,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm31 + vpxorq zmm0,zmm0,zmm8 + vpxorq zmm3,zmm3,zmm22 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq 
xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_759: + + or r13,r13 + je NEAR $L$_after_reduction_759 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_759: + jmp NEAR $L$_last_blocks_done_737 +$L$_last_num_blocks_is_12_737: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,128 + kmovq k1,[rax*8+r10] + cmp r15d,244 + jae NEAR $L$_16_blocks_overflow_760 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + jmp NEAR $L$_16_blocks_ok_760 + +$L$_16_blocks_overflow_760: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 +$L$_16_blocks_ok_760: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1280+rsp] + vmovdqu64 zmm1,ZMMWORD[512+rsp] + vextracti32x4 xmm2,zmm4,3 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[576+rsp] + vmovdqa64 zmm22,ZMMWORD[1344+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[640+rsp] + vmovdqa64 zmm8,ZMMWORD[1408+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[704+rsp] + vmovdqa64 zmm22,ZMMWORD[1472+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20{k1}{z},[128+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm14,zmm24,zmm12,0x96 + vpternlogq zmm7,zmm25,zmm13,0x96 + vpternlogq zmm10,zmm26,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vpsrldq zmm15,zmm10,8 + vpslldq zmm10,zmm10,8 + + vmovdqa64 
xmm16,XMMWORD[POLY2] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vpxorq zmm14,zmm14,zmm15 + vpxorq zmm7,zmm7,zmm10 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vextracti64x4 ymm12,zmm14,1 + vpxorq ymm14,ymm14,ymm12 + vextracti32x4 xmm12,ymm14,1 + vpxorq xmm14,xmm14,xmm12 + vextracti64x4 ymm13,zmm7,1 + vpxorq ymm7,ymm7,ymm13 + vextracti32x4 xmm13,ymm7,1 + vpxorq xmm7,xmm7,xmm13 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vpclmulqdq xmm13,xmm16,xmm7,0x01 + vpslldq xmm13,xmm13,8 + vpxorq xmm13,xmm7,xmm13 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vpclmulqdq xmm12,xmm16,xmm13,0x00 + vpsrldq xmm12,xmm12,4 + vpclmulqdq xmm15,xmm16,xmm13,0x10 + vpslldq xmm15,xmm15,4 + + vpternlogq xmm14,xmm15,xmm12,0x96 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vextracti32x4 xmm11,zmm4,3 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10]{k1},zmm4 + vmovdqu8 zmm20{k1}{z},zmm20 + vpshufb zmm17,zmm17,zmm29 + vpshufb zmm19,zmm19,zmm29 + vpshufb zmm20,zmm20,zmm29 + vextracti32x4 xmm7,zmm20,3 + sub r13,16 * (12 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_761 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[160+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[224+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_761 +$L$_small_initial_partial_block_761: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[176+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 
zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm20,zmm1,0x01 + vpclmulqdq zmm5,zmm20,zmm1,0x10 + vpclmulqdq zmm0,zmm20,zmm1,0x11 + vpclmulqdq zmm3,zmm20,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm31 + vpxorq zmm0,zmm0,zmm8 + vpxorq zmm3,zmm3,zmm22 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_761: + + or r13,r13 + je NEAR $L$_after_reduction_761 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_761: + jmp NEAR $L$_last_blocks_done_737 +$L$_last_num_blocks_is_13_737: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,192 + kmovq k1,[rax*8+r10] + cmp r15d,243 + jae NEAR $L$_16_blocks_overflow_762 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + vpaddd xmm5,xmm4,xmm27 + jmp NEAR $L$_16_blocks_ok_762 + +$L$_16_blocks_overflow_762: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpaddd zmm5,zmm4,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb xmm5,xmm5,xmm29 +$L$_16_blocks_ok_762: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1280+rsp] + vmovdqu64 zmm1,ZMMWORD[512+rsp] + vextracti32x4 xmm2,zmm5,0 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[576+rsp] + vmovdqa64 zmm22,ZMMWORD[1344+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vpxorq xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[640+rsp] + vmovdqa64 zmm8,ZMMWORD[1408+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[704+rsp] + vmovdqa64 zmm22,ZMMWORD[1472+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20,ZMMWORD[128+r11*1+r9] + vmovdqu8 xmm21{k1}{z},[192+r11*1+r9] 
+ vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm14,zmm24,zmm12,0x96 + vpternlogq zmm7,zmm25,zmm13,0x96 + vpternlogq zmm10,zmm26,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vpsrldq zmm15,zmm10,8 + vpslldq zmm10,zmm10,8 + + vmovdqa64 xmm16,XMMWORD[POLY2] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vpxorq zmm14,zmm14,zmm15 + vpxorq zmm7,zmm7,zmm10 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vextracti64x4 ymm12,zmm14,1 + vpxorq ymm14,ymm14,ymm12 + vextracti32x4 xmm12,ymm14,1 + vpxorq xmm14,xmm14,xmm12 + vextracti64x4 ymm13,zmm7,1 + vpxorq ymm7,ymm7,ymm13 + vextracti32x4 xmm13,ymm7,1 + vpxorq xmm7,xmm7,xmm13 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vpclmulqdq xmm13,xmm16,xmm7,0x01 + vpslldq xmm13,xmm13,8 + vpxorq xmm13,xmm7,xmm13 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vpclmulqdq xmm12,xmm16,xmm13,0x00 + vpsrldq xmm12,xmm12,4 + vpclmulqdq xmm15,xmm16,xmm13,0x10 + vpslldq xmm15,xmm15,4 + + vpternlogq xmm14,xmm15,xmm12,0x96 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vaesenclast xmm5,xmm5,xmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vpxorq xmm5,xmm5,xmm21 + vextracti32x4 xmm11,zmm5,0 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm4 + vmovdqu8 XMMWORD[192+r11*1+r10]{k1},xmm5 + vmovdqu8 zmm21{k1}{z},zmm21 + vpshufb zmm17,zmm17,zmm29 + vpshufb zmm19,zmm19,zmm29 + vpshufb zmm20,zmm20,zmm29 + vpshufb xmm21,xmm21,xmm29 + vextracti32x4 xmm7,zmm21,0 + sub r13,16 * (13 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_763 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[144+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[208+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm21,xmm1,0x01 + vpclmulqdq xmm5,xmm21,xmm1,0x10 + vpclmulqdq xmm0,xmm21,xmm1,0x11 + vpclmulqdq xmm3,xmm21,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm31 + vpxorq zmm0,zmm0,zmm8 + vpxorq zmm3,zmm3,zmm22 + + 
vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_763 +$L$_small_initial_partial_block_763: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[160+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[224+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_763: + + or r13,r13 + je NEAR $L$_after_reduction_763 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_763: + jmp NEAR $L$_last_blocks_done_737 +$L$_last_num_blocks_is_14_737: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,192 + kmovq k1,[rax*8+r10] + cmp r15d,242 + jae NEAR $L$_16_blocks_overflow_764 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + vpaddd ymm5,ymm4,ymm27 + jmp NEAR $L$_16_blocks_ok_764 + +$L$_16_blocks_overflow_764: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpaddd zmm5,zmm4,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb ymm5,ymm5,ymm29 +$L$_16_blocks_ok_764: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1280+rsp] + vmovdqu64 zmm1,ZMMWORD[512+rsp] + vextracti32x4 xmm2,zmm5,1 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[576+rsp] + vmovdqa64 zmm22,ZMMWORD[1344+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vpxorq ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[640+rsp] + vmovdqa64 zmm8,ZMMWORD[1408+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + 
vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[704+rsp] + vmovdqa64 zmm22,ZMMWORD[1472+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20,ZMMWORD[128+r11*1+r9] + vmovdqu8 ymm21{k1}{z},[192+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm14,zmm24,zmm12,0x96 + vpternlogq zmm7,zmm25,zmm13,0x96 + vpternlogq zmm10,zmm26,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vpsrldq zmm15,zmm10,8 + vpslldq zmm10,zmm10,8 + + vmovdqa64 xmm16,XMMWORD[POLY2] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vpxorq zmm14,zmm14,zmm15 + vpxorq zmm7,zmm7,zmm10 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vextracti64x4 ymm12,zmm14,1 + vpxorq ymm14,ymm14,ymm12 + vextracti32x4 xmm12,ymm14,1 + vpxorq xmm14,xmm14,xmm12 + vextracti64x4 ymm13,zmm7,1 + vpxorq ymm7,ymm7,ymm13 + vextracti32x4 xmm13,ymm7,1 + vpxorq xmm7,xmm7,xmm13 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vpclmulqdq xmm13,xmm16,xmm7,0x01 + vpslldq xmm13,xmm13,8 + vpxorq xmm13,xmm7,xmm13 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vpclmulqdq xmm12,xmm16,xmm13,0x00 + vpsrldq xmm12,xmm12,4 + vpclmulqdq xmm15,xmm16,xmm13,0x10 + vpslldq xmm15,xmm15,4 + + vpternlogq xmm14,xmm15,xmm12,0x96 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vaesenclast ymm5,ymm5,ymm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vpxorq ymm5,ymm5,ymm21 + vextracti32x4 xmm11,zmm5,1 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm4 + vmovdqu8 YMMWORD[192+r11*1+r10]{k1},ymm5 + vmovdqu8 zmm21{k1}{z},zmm21 + vpshufb zmm17,zmm17,zmm29 + vpshufb 
zmm19,zmm19,zmm29 + vpshufb zmm20,zmm20,zmm29 + vpshufb ymm21,ymm21,ymm29 + vextracti32x4 xmm7,zmm21,1 + sub r13,16 * (14 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_765 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[128+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[192+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm21,ymm1,0x01 + vpclmulqdq ymm5,ymm21,ymm1,0x10 + vpclmulqdq ymm0,ymm21,ymm1,0x11 + vpclmulqdq ymm3,ymm21,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm31 + vpxorq zmm0,zmm0,zmm8 + vpxorq zmm3,zmm3,zmm22 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_765 +$L$_small_initial_partial_block_765: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[144+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[208+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm21,xmm1,0x01 + vpclmulqdq xmm5,xmm21,xmm1,0x10 + vpclmulqdq xmm0,xmm21,xmm1,0x11 + vpclmulqdq xmm3,xmm21,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm31 + vpxorq zmm0,zmm0,zmm8 + vpxorq zmm3,zmm3,zmm22 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_765: + + or 
r13,r13 + je NEAR $L$_after_reduction_765 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_765: + jmp NEAR $L$_last_blocks_done_737 +$L$_last_num_blocks_is_15_737: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,192 + kmovq k1,[rax*8+r10] + cmp r15d,241 + jae NEAR $L$_16_blocks_overflow_766 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + vpaddd zmm5,zmm4,zmm27 + jmp NEAR $L$_16_blocks_ok_766 + +$L$_16_blocks_overflow_766: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpaddd zmm5,zmm4,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb zmm5,zmm5,zmm29 +$L$_16_blocks_ok_766: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1280+rsp] + vmovdqu64 zmm1,ZMMWORD[512+rsp] + vextracti32x4 xmm2,zmm5,2 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[576+rsp] + vmovdqa64 zmm22,ZMMWORD[1344+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[640+rsp] + vmovdqa64 zmm8,ZMMWORD[1408+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[704+rsp] + vmovdqa64 zmm22,ZMMWORD[1472+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20,ZMMWORD[128+r11*1+r9] + vmovdqu8 zmm21{k1}{z},[192+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm14,zmm24,zmm12,0x96 + vpternlogq zmm7,zmm25,zmm13,0x96 + vpternlogq zmm10,zmm26,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vpsrldq zmm15,zmm10,8 + vpslldq zmm10,zmm10,8 + + vmovdqa64 xmm16,XMMWORD[POLY2] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + 
vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vpxorq zmm14,zmm14,zmm15 + vpxorq zmm7,zmm7,zmm10 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vextracti64x4 ymm12,zmm14,1 + vpxorq ymm14,ymm14,ymm12 + vextracti32x4 xmm12,ymm14,1 + vpxorq xmm14,xmm14,xmm12 + vextracti64x4 ymm13,zmm7,1 + vpxorq ymm7,ymm7,ymm13 + vextracti32x4 xmm13,ymm7,1 + vpxorq xmm7,xmm7,xmm13 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vpclmulqdq xmm13,xmm16,xmm7,0x01 + vpslldq xmm13,xmm13,8 + vpxorq xmm13,xmm7,xmm13 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vpclmulqdq xmm12,xmm16,xmm13,0x00 + vpsrldq xmm12,xmm12,4 + vpclmulqdq xmm15,xmm16,xmm13,0x10 + vpslldq xmm15,xmm15,4 + + vpternlogq xmm14,xmm15,xmm12,0x96 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vaesenclast zmm5,zmm5,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vpxorq zmm5,zmm5,zmm21 + vextracti32x4 xmm11,zmm5,2 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm4 + vmovdqu8 ZMMWORD[192+r11*1+r10]{k1},zmm5 + vmovdqu8 zmm21{k1}{z},zmm21 + vpshufb zmm17,zmm17,zmm29 + vpshufb zmm19,zmm19,zmm29 + vpshufb zmm20,zmm20,zmm29 + vpshufb zmm21,zmm21,zmm29 + vextracti32x4 xmm7,zmm21,2 + sub r13,16 * (15 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_767 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[112+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[176+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm21,zmm1,0x01 + vpclmulqdq zmm5,zmm21,zmm1,0x10 + vpclmulqdq zmm0,zmm21,zmm1,0x11 + vpclmulqdq zmm3,zmm21,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm31 + vpxorq zmm0,zmm0,zmm8 + vpxorq zmm3,zmm3,zmm22 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_767 +$L$_small_initial_partial_block_767: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[128+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq 
zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[192+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm21,ymm1,0x01 + vpclmulqdq ymm5,ymm21,ymm1,0x10 + vpclmulqdq ymm0,ymm21,ymm1,0x11 + vpclmulqdq ymm3,ymm21,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm31 + vpxorq zmm0,zmm0,zmm8 + vpxorq zmm3,zmm3,zmm22 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_767: + + or r13,r13 + je NEAR $L$_after_reduction_767 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_767: + jmp NEAR $L$_last_blocks_done_737 +$L$_last_num_blocks_is_16_737: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,192 + kmovq k1,[rax*8+r10] + cmp r15d,240 + jae NEAR $L$_16_blocks_overflow_768 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + vpaddd zmm5,zmm4,zmm27 + jmp NEAR $L$_16_blocks_ok_768 + +$L$_16_blocks_overflow_768: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpaddd zmm5,zmm4,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb zmm5,zmm5,zmm29 +$L$_16_blocks_ok_768: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1280+rsp] + vmovdqu64 zmm1,ZMMWORD[512+rsp] + vextracti32x4 xmm2,zmm5,3 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[576+rsp] + vmovdqa64 zmm22,ZMMWORD[1344+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[640+rsp] + vmovdqa64 zmm8,ZMMWORD[1408+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[704+rsp] + vmovdqa64 zmm22,ZMMWORD[1472+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + 
vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20,ZMMWORD[128+r11*1+r9] + vmovdqu8 zmm21{k1}{z},[192+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm14,zmm24,zmm12,0x96 + vpternlogq zmm7,zmm25,zmm13,0x96 + vpternlogq zmm10,zmm26,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vpsrldq zmm15,zmm10,8 + vpslldq zmm10,zmm10,8 + + vmovdqa64 xmm16,XMMWORD[POLY2] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vpxorq zmm14,zmm14,zmm15 + vpxorq zmm7,zmm7,zmm10 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vextracti64x4 ymm12,zmm14,1 + vpxorq ymm14,ymm14,ymm12 + vextracti32x4 xmm12,ymm14,1 + vpxorq xmm14,xmm14,xmm12 + vextracti64x4 ymm13,zmm7,1 + vpxorq ymm7,ymm7,ymm13 + vextracti32x4 xmm13,ymm7,1 + vpxorq xmm7,xmm7,xmm13 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vpclmulqdq xmm13,xmm16,xmm7,0x01 + vpslldq xmm13,xmm13,8 + vpxorq xmm13,xmm7,xmm13 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vpclmulqdq xmm12,xmm16,xmm13,0x00 + vpsrldq xmm12,xmm12,4 + vpclmulqdq xmm15,xmm16,xmm13,0x10 + vpslldq xmm15,xmm15,4 + + vpternlogq xmm14,xmm15,xmm12,0x96 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vaesenclast zmm5,zmm5,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vpxorq zmm5,zmm5,zmm21 + vextracti32x4 xmm11,zmm5,3 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm4 + vmovdqu8 ZMMWORD[192+r11*1+r10]{k1},zmm5 + vmovdqu8 zmm21{k1}{z},zmm21 + vpshufb zmm17,zmm17,zmm29 + vpshufb zmm19,zmm19,zmm29 + vpshufb zmm20,zmm20,zmm29 + vpshufb zmm21,zmm21,zmm29 + vextracti32x4 xmm7,zmm21,3 + sub r13,16 * (16 - 1) +$L$_small_initial_partial_block_769: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[112+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[176+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + 
vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm21,zmm1,0x01 + vpclmulqdq zmm5,zmm21,zmm1,0x10 + vpclmulqdq zmm0,zmm21,zmm1,0x11 + vpclmulqdq zmm3,zmm21,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm31 + vpxorq zmm0,zmm0,zmm8 + vpxorq zmm3,zmm3,zmm22 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_769: + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_769: + jmp NEAR $L$_last_blocks_done_737 +$L$_last_num_blocks_is_0_737: + vmovdqa64 zmm13,ZMMWORD[1280+rsp] + vmovdqu64 zmm12,ZMMWORD[512+rsp] + vpclmulqdq zmm0,zmm13,zmm12,0x11 + vpclmulqdq zmm3,zmm13,zmm12,0x00 + vpclmulqdq zmm4,zmm13,zmm12,0x01 + vpclmulqdq zmm5,zmm13,zmm12,0x10 + vmovdqa64 zmm13,ZMMWORD[1344+rsp] + vmovdqu64 zmm12,ZMMWORD[576+rsp] + vpclmulqdq zmm6,zmm13,zmm12,0x11 + vpclmulqdq zmm7,zmm13,zmm12,0x00 + vpclmulqdq zmm10,zmm13,zmm12,0x01 + vpclmulqdq zmm11,zmm13,zmm12,0x10 + vpternlogq zmm26,zmm4,zmm10,0x96 + vpternlogq zmm24,zmm0,zmm6,0x96 + vpternlogq zmm25,zmm3,zmm7,0x96 + vpternlogq zmm26,zmm5,zmm11,0x96 + vmovdqa64 zmm13,ZMMWORD[1408+rsp] + vmovdqu64 zmm12,ZMMWORD[640+rsp] + vpclmulqdq zmm0,zmm13,zmm12,0x11 + vpclmulqdq zmm3,zmm13,zmm12,0x00 + vpclmulqdq zmm4,zmm13,zmm12,0x01 + vpclmulqdq zmm5,zmm13,zmm12,0x10 + vmovdqa64 zmm13,ZMMWORD[1472+rsp] + vmovdqu64 zmm12,ZMMWORD[704+rsp] + vpclmulqdq zmm6,zmm13,zmm12,0x11 + vpclmulqdq zmm7,zmm13,zmm12,0x00 + vpclmulqdq zmm10,zmm13,zmm12,0x01 + vpclmulqdq zmm11,zmm13,zmm12,0x10 + + vpternlogq zmm26,zmm4,zmm10,0x96 + vpternlogq zmm24,zmm0,zmm6,0x96 + vpternlogq zmm25,zmm3,zmm7,0x96 + vpternlogq zmm26,zmm5,zmm11,0x96 + + vpsrldq zmm0,zmm26,8 + vpslldq zmm3,zmm26,8 + vpxorq zmm24,zmm24,zmm0 + vpxorq zmm25,zmm25,zmm3 + vextracti64x4 ymm0,zmm24,1 + vpxorq ymm24,ymm24,ymm0 + vextracti32x4 xmm0,ymm24,1 + vpxorq xmm24,xmm24,xmm0 + vextracti64x4 ymm3,zmm25,1 + vpxorq ymm25,ymm25,ymm3 + vextracti32x4 xmm3,ymm25,1 + vpxorq xmm25,xmm25,xmm3 + vmovdqa64 xmm4,XMMWORD[POLY2] + + + vpclmulqdq xmm0,xmm4,xmm25,0x01 + vpslldq xmm0,xmm0,8 + vpxorq xmm0,xmm25,xmm0 + + + vpclmulqdq xmm3,xmm4,xmm0,0x00 + vpsrldq xmm3,xmm3,4 + vpclmulqdq xmm14,xmm4,xmm0,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm3,xmm24,0x96 + +$L$_last_blocks_done_737: + vpshufb xmm2,xmm2,xmm29 + jmp NEAR $L$_ghash_done_659 + +$L$_message_below_32_blocks_659: + + + sub r13,256 + add r11,256 + mov r10d,r13d + test r14,r14 + jnz NEAR $L$_skip_hkeys_precomputation_770 + vmovdqu64 zmm3,ZMMWORD[640+rsp] + + + vshufi64x2 zmm3,zmm3,zmm3,0x00 + + vmovdqu64 zmm4,ZMMWORD[576+rsp] + vmovdqu64 zmm5,ZMMWORD[512+rsp] + + vpclmulqdq zmm6,zmm4,zmm3,0x11 + vpclmulqdq zmm7,zmm4,zmm3,0x00 + vpclmulqdq 
zmm10,zmm4,zmm3,0x01 + vpclmulqdq zmm4,zmm4,zmm3,0x10 + vpxorq zmm4,zmm4,zmm10 + + vpsrldq zmm10,zmm4,8 + vpslldq zmm4,zmm4,8 + vpxorq zmm6,zmm6,zmm10 + vpxorq zmm4,zmm4,zmm7 + + + + vmovdqu64 zmm10,ZMMWORD[POLY2] + + vpclmulqdq zmm7,zmm10,zmm4,0x01 + vpslldq zmm7,zmm7,8 + vpxorq zmm4,zmm4,zmm7 + + + + vpclmulqdq zmm7,zmm10,zmm4,0x00 + vpsrldq zmm7,zmm7,4 + vpclmulqdq zmm4,zmm10,zmm4,0x10 + vpslldq zmm4,zmm4,4 + + vpternlogq zmm4,zmm6,zmm7,0x96 + + vmovdqu64 ZMMWORD[448+rsp],zmm4 + + vpclmulqdq zmm6,zmm5,zmm3,0x11 + vpclmulqdq zmm7,zmm5,zmm3,0x00 + vpclmulqdq zmm10,zmm5,zmm3,0x01 + vpclmulqdq zmm5,zmm5,zmm3,0x10 + vpxorq zmm5,zmm5,zmm10 + + vpsrldq zmm10,zmm5,8 + vpslldq zmm5,zmm5,8 + vpxorq zmm6,zmm6,zmm10 + vpxorq zmm5,zmm5,zmm7 + + + + vmovdqu64 zmm10,ZMMWORD[POLY2] + + vpclmulqdq zmm7,zmm10,zmm5,0x01 + vpslldq zmm7,zmm7,8 + vpxorq zmm5,zmm5,zmm7 + + + + vpclmulqdq zmm7,zmm10,zmm5,0x00 + vpsrldq zmm7,zmm7,4 + vpclmulqdq zmm5,zmm10,zmm5,0x10 + vpslldq zmm5,zmm5,4 + + vpternlogq zmm5,zmm6,zmm7,0x96 + + vmovdqu64 ZMMWORD[384+rsp],zmm5 + + vpclmulqdq zmm6,zmm4,zmm3,0x11 + vpclmulqdq zmm7,zmm4,zmm3,0x00 + vpclmulqdq zmm10,zmm4,zmm3,0x01 + vpclmulqdq zmm4,zmm4,zmm3,0x10 + vpxorq zmm4,zmm4,zmm10 + + vpsrldq zmm10,zmm4,8 + vpslldq zmm4,zmm4,8 + vpxorq zmm6,zmm6,zmm10 + vpxorq zmm4,zmm4,zmm7 + + + + vmovdqu64 zmm10,ZMMWORD[POLY2] + + vpclmulqdq zmm7,zmm10,zmm4,0x01 + vpslldq zmm7,zmm7,8 + vpxorq zmm4,zmm4,zmm7 + + + + vpclmulqdq zmm7,zmm10,zmm4,0x00 + vpsrldq zmm7,zmm7,4 + vpclmulqdq zmm4,zmm10,zmm4,0x10 + vpslldq zmm4,zmm4,4 + + vpternlogq zmm4,zmm6,zmm7,0x96 + + vmovdqu64 ZMMWORD[320+rsp],zmm4 + + vpclmulqdq zmm6,zmm5,zmm3,0x11 + vpclmulqdq zmm7,zmm5,zmm3,0x00 + vpclmulqdq zmm10,zmm5,zmm3,0x01 + vpclmulqdq zmm5,zmm5,zmm3,0x10 + vpxorq zmm5,zmm5,zmm10 + + vpsrldq zmm10,zmm5,8 + vpslldq zmm5,zmm5,8 + vpxorq zmm6,zmm6,zmm10 + vpxorq zmm5,zmm5,zmm7 + + + + vmovdqu64 zmm10,ZMMWORD[POLY2] + + vpclmulqdq zmm7,zmm10,zmm5,0x01 + vpslldq zmm7,zmm7,8 + vpxorq zmm5,zmm5,zmm7 + + + + vpclmulqdq zmm7,zmm10,zmm5,0x00 + vpsrldq zmm7,zmm7,4 + vpclmulqdq zmm5,zmm10,zmm5,0x10 + vpslldq zmm5,zmm5,4 + + vpternlogq zmm5,zmm6,zmm7,0x96 + + vmovdqu64 ZMMWORD[256+rsp],zmm5 +$L$_skip_hkeys_precomputation_770: + mov r14,1 + and r10d,~15 + mov ebx,512 + sub ebx,r10d + mov r10d,r13d + add r10d,15 + shr r10d,4 + je NEAR $L$_last_num_blocks_is_0_771 + + cmp r10d,8 + je NEAR $L$_last_num_blocks_is_8_771 + jb NEAR $L$_last_num_blocks_is_7_1_771 + + + cmp r10d,12 + je NEAR $L$_last_num_blocks_is_12_771 + jb NEAR $L$_last_num_blocks_is_11_9_771 + + + cmp r10d,15 + je NEAR $L$_last_num_blocks_is_15_771 + ja NEAR $L$_last_num_blocks_is_16_771 + cmp r10d,14 + je NEAR $L$_last_num_blocks_is_14_771 + jmp NEAR $L$_last_num_blocks_is_13_771 + +$L$_last_num_blocks_is_11_9_771: + + cmp r10d,10 + je NEAR $L$_last_num_blocks_is_10_771 + ja NEAR $L$_last_num_blocks_is_11_771 + jmp NEAR $L$_last_num_blocks_is_9_771 + +$L$_last_num_blocks_is_7_1_771: + cmp r10d,4 + je NEAR $L$_last_num_blocks_is_4_771 + jb NEAR $L$_last_num_blocks_is_3_1_771 + + cmp r10d,6 + ja NEAR $L$_last_num_blocks_is_7_771 + je NEAR $L$_last_num_blocks_is_6_771 + jmp NEAR $L$_last_num_blocks_is_5_771 + +$L$_last_num_blocks_is_3_1_771: + + cmp r10d,2 + ja NEAR $L$_last_num_blocks_is_3_771 + je NEAR $L$_last_num_blocks_is_2_771 +$L$_last_num_blocks_is_1_771: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + kmovq k1,[rax*8+r10] + cmp r15d,255 + jae NEAR $L$_16_blocks_overflow_772 + vpaddd xmm0,xmm2,xmm28 + jmp NEAR $L$_16_blocks_ok_772 + +$L$_16_blocks_overflow_772: + 
vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpshufb xmm0,xmm0,xmm29 +$L$_16_blocks_ok_772: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm0,0 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc xmm0,xmm0,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc xmm0,xmm0,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 xmm17{k1}{z},[r11*1+r9] + vaesenc xmm0,xmm0,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc xmm0,xmm0,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc xmm0,xmm0,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc xmm0,xmm0,xmm31 + vaesenclast xmm0,xmm0,xmm30 + vpxorq xmm0,xmm0,xmm17 + vextracti32x4 xmm11,zmm0,0 + mov r10,QWORD[120+rbp] + vmovdqu8 XMMWORD[r11*1+r10]{k1},xmm0 + vmovdqu8 zmm17{k1}{z},zmm17 + vpshufb xmm17,xmm17,xmm29 + vextracti32x4 xmm7,zmm17,0 + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_773 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm17,xmm1,0x01 + vpclmulqdq xmm5,xmm17,xmm1,0x10 + vpclmulqdq xmm0,xmm17,xmm1,0x11 + vpclmulqdq xmm3,xmm17,xmm1,0x00 + vpxorq zmm4,zmm4,zmm26 + vpxorq zmm0,zmm0,zmm24 + vpxorq zmm3,zmm3,zmm25 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_773 +$L$_small_initial_partial_block_773: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + + + vpsrldq 
zmm0,zmm26,8 + vpslldq zmm3,zmm26,8 + vpxorq zmm24,zmm24,zmm0 + vpxorq zmm25,zmm25,zmm3 + vextracti64x4 ymm0,zmm24,1 + vpxorq ymm24,ymm24,ymm0 + vextracti32x4 xmm0,ymm24,1 + vpxorq xmm24,xmm24,xmm0 + vextracti64x4 ymm3,zmm25,1 + vpxorq ymm25,ymm25,ymm3 + vextracti32x4 xmm3,ymm25,1 + vpxorq xmm25,xmm25,xmm3 + vmovdqa64 xmm0,XMMWORD[POLY2] + + + vpclmulqdq xmm3,xmm0,xmm25,0x01 + vpslldq xmm3,xmm3,8 + vpxorq xmm3,xmm25,xmm3 + + + vpclmulqdq xmm4,xmm0,xmm3,0x00 + vpsrldq xmm4,xmm4,4 + vpclmulqdq xmm14,xmm0,xmm3,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm4,xmm24,0x96 + + + + + + + + + + + + + vpxorq xmm14,xmm14,xmm7 + + jmp NEAR $L$_after_reduction_773 +$L$_small_initial_compute_done_773: +$L$_after_reduction_773: + jmp NEAR $L$_last_blocks_done_771 +$L$_last_num_blocks_is_2_771: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + kmovq k1,[rax*8+r10] + cmp r15d,254 + jae NEAR $L$_16_blocks_overflow_774 + vpaddd ymm0,ymm2,ymm28 + jmp NEAR $L$_16_blocks_ok_774 + +$L$_16_blocks_overflow_774: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpshufb ymm0,ymm0,ymm29 +$L$_16_blocks_ok_774: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm0,1 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc ymm0,ymm0,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc ymm0,ymm0,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 ymm17{k1}{z},[r11*1+r9] + vaesenc ymm0,ymm0,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc ymm0,ymm0,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc ymm0,ymm0,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc ymm0,ymm0,ymm31 + vaesenclast ymm0,ymm0,ymm30 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm11,zmm0,1 + mov r10,QWORD[120+rbp] + vmovdqu8 YMMWORD[r11*1+r10]{k1},ymm0 + vmovdqu8 zmm17{k1}{z},zmm17 + vpshufb ymm17,ymm17,ymm29 + vextracti32x4 xmm7,zmm17,1 + sub r13,16 * (2 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_775 + + + + + + sub r13,16 + mov 
QWORD[r8],0 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm17,ymm1,0x01 + vpclmulqdq ymm5,ymm17,ymm1,0x10 + vpclmulqdq ymm0,ymm17,ymm1,0x11 + vpclmulqdq ymm3,ymm17,ymm1,0x00 + vpxorq zmm4,zmm4,zmm26 + vpxorq zmm0,zmm0,zmm24 + vpxorq zmm3,zmm3,zmm25 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_775 +$L$_small_initial_partial_block_775: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm17,xmm1,0x01 + vpclmulqdq xmm5,xmm17,xmm1,0x10 + vpclmulqdq xmm0,xmm17,xmm1,0x11 + vpclmulqdq xmm3,xmm17,xmm1,0x00 + vpxorq zmm4,zmm4,zmm26 + vpxorq zmm0,zmm0,zmm24 + vpxorq zmm3,zmm3,zmm25 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_775: + + or r13,r13 + je NEAR $L$_after_reduction_775 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_775: + jmp NEAR $L$_last_blocks_done_771 +$L$_last_num_blocks_is_3_771: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + kmovq k1,[rax*8+r10] + cmp r15d,253 + jae NEAR $L$_16_blocks_overflow_776 + vpaddd zmm0,zmm2,zmm28 + jmp NEAR $L$_16_blocks_ok_776 + +$L$_16_blocks_overflow_776: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpshufb zmm0,zmm0,zmm29 +$L$_16_blocks_ok_776: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm0,2 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + 
vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17{k1}{z},[r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vpxorq zmm0,zmm0,zmm17 + vextracti32x4 xmm11,zmm0,2 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10]{k1},zmm0 + vmovdqu8 zmm17{k1}{z},zmm17 + vpshufb zmm17,zmm17,zmm29 + vextracti32x4 xmm7,zmm17,2 + sub r13,16 * (3 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_777 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpxorq zmm4,zmm4,zmm26 + vpxorq zmm0,zmm0,zmm24 + vpxorq zmm3,zmm3,zmm25 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_777 +$L$_small_initial_partial_block_777: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm17,ymm1,0x01 + vpclmulqdq ymm5,ymm17,ymm1,0x10 + vpclmulqdq ymm0,ymm17,ymm1,0x11 + vpclmulqdq ymm3,ymm17,ymm1,0x00 + vpxorq zmm4,zmm4,zmm26 + vpxorq zmm0,zmm0,zmm24 + vpxorq zmm3,zmm3,zmm25 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_777: + + or r13,r13 + je NEAR $L$_after_reduction_777 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_777: + jmp NEAR $L$_last_blocks_done_771 +$L$_last_num_blocks_is_4_771: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + kmovq k1,[rax*8+r10] + cmp r15d,252 + jae NEAR $L$_16_blocks_overflow_778 + vpaddd zmm0,zmm2,zmm28 + 
jmp NEAR $L$_16_blocks_ok_778 + +$L$_16_blocks_overflow_778: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpshufb zmm0,zmm0,zmm29 +$L$_16_blocks_ok_778: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm0,3 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17{k1}{z},[r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vpxorq zmm0,zmm0,zmm17 + vextracti32x4 xmm11,zmm0,3 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10]{k1},zmm0 + vmovdqu8 zmm17{k1}{z},zmm17 + vpshufb zmm17,zmm17,zmm29 + vextracti32x4 xmm7,zmm17,3 + sub r13,16 * (4 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_779 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + + vpxorq zmm30,zmm30,zmm26 + vpxorq zmm8,zmm8,zmm24 + vpxorq zmm22,zmm22,zmm25 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_779 
+$L$_small_initial_partial_block_779: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpxorq zmm4,zmm4,zmm26 + vpxorq zmm0,zmm0,zmm24 + vpxorq zmm3,zmm3,zmm25 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_779: + + or r13,r13 + je NEAR $L$_after_reduction_779 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_779: + jmp NEAR $L$_last_blocks_done_771 +$L$_last_num_blocks_is_5_771: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,64 + kmovq k1,[rax*8+r10] + cmp r15d,251 + jae NEAR $L$_16_blocks_overflow_780 + vpaddd zmm0,zmm2,zmm28 + vpaddd xmm3,xmm0,xmm27 + jmp NEAR $L$_16_blocks_ok_780 + +$L$_16_blocks_overflow_780: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb xmm3,xmm3,xmm29 +$L$_16_blocks_ok_780: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm3,0 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 xmm19{k1}{z},[64+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + 
vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast xmm3,xmm3,xmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq xmm3,xmm3,xmm19 + vextracti32x4 xmm11,zmm3,0 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 XMMWORD[64+r11*1+r10]{k1},xmm3 + vmovdqu8 zmm19{k1}{z},zmm19 + vpshufb zmm17,zmm17,zmm29 + vpshufb xmm19,xmm19,xmm29 + vextracti32x4 xmm7,zmm19,0 + sub r13,16 * (5 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_781 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm19,xmm1,0x01 + vpclmulqdq xmm5,xmm19,xmm1,0x10 + vpclmulqdq xmm0,xmm19,xmm1,0x11 + vpclmulqdq xmm3,xmm19,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_781 +$L$_small_initial_partial_block_781: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + + vpxorq zmm30,zmm30,zmm26 + vpxorq zmm8,zmm8,zmm24 + vpxorq zmm22,zmm22,zmm25 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_781: + + or r13,r13 + je NEAR $L$_after_reduction_781 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_781: + jmp NEAR $L$_last_blocks_done_771 +$L$_last_num_blocks_is_6_771: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,64 + kmovq k1,[rax*8+r10] + cmp r15d,250 + jae NEAR $L$_16_blocks_overflow_782 + vpaddd zmm0,zmm2,zmm28 + vpaddd ymm3,ymm0,ymm27 + jmp NEAR $L$_16_blocks_ok_782 + 
+$L$_16_blocks_overflow_782: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb ymm3,ymm3,ymm29 +$L$_16_blocks_ok_782: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm3,1 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 ymm19{k1}{z},[64+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast ymm3,ymm3,ymm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm11,zmm3,1 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 YMMWORD[64+r11*1+r10]{k1},ymm3 + vmovdqu8 zmm19{k1}{z},zmm19 + vpshufb zmm17,zmm17,zmm29 + vpshufb ymm19,ymm19,ymm29 + vextracti32x4 xmm7,zmm19,1 + sub r13,16 * (6 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_783 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm19,ymm1,0x01 + vpclmulqdq ymm5,ymm19,ymm1,0x10 + vpclmulqdq ymm0,ymm19,ymm1,0x11 + vpclmulqdq ymm3,ymm19,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + 
vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_783 +$L$_small_initial_partial_block_783: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm19,xmm1,0x01 + vpclmulqdq xmm5,xmm19,xmm1,0x10 + vpclmulqdq xmm0,xmm19,xmm1,0x11 + vpclmulqdq xmm3,xmm19,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_783: + + or r13,r13 + je NEAR $L$_after_reduction_783 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_783: + jmp NEAR $L$_last_blocks_done_771 +$L$_last_num_blocks_is_7_771: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,64 + kmovq k1,[rax*8+r10] + cmp r15d,249 + jae NEAR $L$_16_blocks_overflow_784 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + jmp NEAR $L$_16_blocks_ok_784 + +$L$_16_blocks_overflow_784: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 +$L$_16_blocks_ok_784: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm3,2 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 
zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19{k1}{z},[64+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti32x4 xmm11,zmm3,2 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10]{k1},zmm3 + vmovdqu8 zmm19{k1}{z},zmm19 + vpshufb zmm17,zmm17,zmm29 + vpshufb zmm19,zmm19,zmm29 + vextracti32x4 xmm7,zmm19,2 + sub r13,16 * (7 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_785 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm19,zmm1,0x01 + vpclmulqdq zmm5,zmm19,zmm1,0x10 + vpclmulqdq zmm0,zmm19,zmm1,0x11 + vpclmulqdq zmm3,zmm19,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_785 +$L$_small_initial_partial_block_785: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm19,ymm1,0x01 + vpclmulqdq ymm5,ymm19,ymm1,0x10 + vpclmulqdq ymm0,ymm19,ymm1,0x11 + vpclmulqdq ymm3,ymm19,ymm1,0x00 + + vpxorq 
zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_785: + + or r13,r13 + je NEAR $L$_after_reduction_785 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_785: + jmp NEAR $L$_last_blocks_done_771 +$L$_last_num_blocks_is_8_771: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,64 + kmovq k1,[rax*8+r10] + cmp r15d,248 + jae NEAR $L$_16_blocks_overflow_786 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + jmp NEAR $L$_16_blocks_ok_786 + +$L$_16_blocks_overflow_786: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 +$L$_16_blocks_ok_786: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm3,3 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19{k1}{z},[64+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 
zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti32x4 xmm11,zmm3,3 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10]{k1},zmm3 + vmovdqu8 zmm19{k1}{z},zmm19 + vpshufb zmm17,zmm17,zmm29 + vpshufb zmm19,zmm19,zmm29 + vextracti32x4 xmm7,zmm19,3 + sub r13,16 * (8 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_787 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[224+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + + vpxorq zmm30,zmm30,zmm26 + vpxorq zmm8,zmm8,zmm24 + vpxorq zmm22,zmm22,zmm25 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_787 +$L$_small_initial_partial_block_787: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm19,zmm1,0x01 + vpclmulqdq zmm5,zmm19,zmm1,0x10 + vpclmulqdq zmm0,zmm19,zmm1,0x11 + vpclmulqdq zmm3,zmm19,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_787: + + or r13,r13 + je NEAR $L$_after_reduction_787 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_787: + jmp NEAR $L$_last_blocks_done_771 +$L$_last_num_blocks_is_9_771: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,128 + kmovq k1,[rax*8+r10] + cmp r15d,247 + jae NEAR $L$_16_blocks_overflow_788 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + 
vpaddd xmm4,xmm3,xmm27 + jmp NEAR $L$_16_blocks_ok_788 + +$L$_16_blocks_overflow_788: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb xmm4,xmm4,xmm29 +$L$_16_blocks_ok_788: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm4,0 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 xmm20{k1}{z},[128+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast xmm4,xmm4,xmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq xmm4,xmm4,xmm20 + vextracti32x4 xmm11,zmm4,0 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 XMMWORD[128+r11*1+r10]{k1},xmm4 + vmovdqu8 zmm20{k1}{z},zmm20 + vpshufb zmm17,zmm17,zmm29 + vpshufb zmm19,zmm19,zmm29 + vpshufb 
xmm20,xmm20,xmm29 + vextracti32x4 xmm7,zmm20,0 + sub r13,16 * (9 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_789 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[208+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm20,xmm1,0x01 + vpclmulqdq xmm5,xmm20,xmm1,0x10 + vpclmulqdq xmm0,xmm20,xmm1,0x11 + vpclmulqdq xmm3,xmm20,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_789 +$L$_small_initial_partial_block_789: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[224+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + + vpxorq zmm30,zmm30,zmm26 + vpxorq zmm8,zmm8,zmm24 + vpxorq zmm22,zmm22,zmm25 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_789: + + or r13,r13 + je NEAR $L$_after_reduction_789 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_789: + jmp NEAR $L$_last_blocks_done_771 +$L$_last_num_blocks_is_10_771: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,128 + kmovq k1,[rax*8+r10] + cmp r15d,246 + jae NEAR $L$_16_blocks_overflow_790 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd ymm4,ymm3,ymm27 + jmp NEAR $L$_16_blocks_ok_790 + +$L$_16_blocks_overflow_790: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb ymm4,ymm4,ymm29 +$L$_16_blocks_ok_790: 
+ + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm4,1 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 ymm20{k1}{z},[128+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast ymm4,ymm4,ymm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq ymm4,ymm4,ymm20 + vextracti32x4 xmm11,zmm4,1 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 YMMWORD[128+r11*1+r10]{k1},ymm4 + vmovdqu8 zmm20{k1}{z},zmm20 + vpshufb zmm17,zmm17,zmm29 + vpshufb zmm19,zmm19,zmm29 + vpshufb ymm20,ymm20,ymm29 + vextracti32x4 xmm7,zmm20,1 + sub r13,16 * (10 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_791 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[192+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + 
vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm20,ymm1,0x01 + vpclmulqdq ymm5,ymm20,ymm1,0x10 + vpclmulqdq ymm0,ymm20,ymm1,0x11 + vpclmulqdq ymm3,ymm20,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_791 +$L$_small_initial_partial_block_791: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[208+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm20,xmm1,0x01 + vpclmulqdq xmm5,xmm20,xmm1,0x10 + vpclmulqdq xmm0,xmm20,xmm1,0x11 + vpclmulqdq xmm3,xmm20,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_791: + + or r13,r13 + je NEAR $L$_after_reduction_791 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_791: + jmp NEAR $L$_last_blocks_done_771 +$L$_last_num_blocks_is_11_771: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,128 + kmovq k1,[rax*8+r10] + cmp r15d,245 + jae NEAR $L$_16_blocks_overflow_792 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + jmp NEAR $L$_16_blocks_ok_792 + +$L$_16_blocks_overflow_792: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 +$L$_16_blocks_ok_792: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + 
vextracti32x4 xmm2,zmm4,2 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20{k1}{z},[128+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vextracti32x4 xmm11,zmm4,2 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10]{k1},zmm4 + vmovdqu8 zmm20{k1}{z},zmm20 + vpshufb zmm17,zmm17,zmm29 + vpshufb zmm19,zmm19,zmm29 + vpshufb zmm20,zmm20,zmm29 + vextracti32x4 xmm7,zmm20,2 + sub r13,16 * (11 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_793 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[176+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq 
zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm20,zmm1,0x01 + vpclmulqdq zmm5,zmm20,zmm1,0x10 + vpclmulqdq zmm0,zmm20,zmm1,0x11 + vpclmulqdq zmm3,zmm20,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_793 +$L$_small_initial_partial_block_793: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[192+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm20,ymm1,0x01 + vpclmulqdq ymm5,ymm20,ymm1,0x10 + vpclmulqdq ymm0,ymm20,ymm1,0x11 + vpclmulqdq ymm3,ymm20,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_793: + + or r13,r13 + je NEAR $L$_after_reduction_793 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_793: + jmp NEAR $L$_last_blocks_done_771 +$L$_last_num_blocks_is_12_771: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,128 + kmovq k1,[rax*8+r10] + cmp r15d,244 + jae NEAR $L$_16_blocks_overflow_794 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + jmp NEAR $L$_16_blocks_ok_794 + +$L$_16_blocks_overflow_794: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 +$L$_16_blocks_ok_794: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm4,3 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 
zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20{k1}{z},[128+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vextracti32x4 xmm11,zmm4,3 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10]{k1},zmm4 + vmovdqu8 zmm20{k1}{z},zmm20 + vpshufb zmm17,zmm17,zmm29 + vpshufb zmm19,zmm19,zmm29 + vpshufb zmm20,zmm20,zmm29 + vextracti32x4 xmm7,zmm20,3 + sub r13,16 * (12 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_795 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[160+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[224+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 
zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + + vpxorq zmm30,zmm30,zmm26 + vpxorq zmm8,zmm8,zmm24 + vpxorq zmm22,zmm22,zmm25 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_795 +$L$_small_initial_partial_block_795: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[176+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm20,zmm1,0x01 + vpclmulqdq zmm5,zmm20,zmm1,0x10 + vpclmulqdq zmm0,zmm20,zmm1,0x11 + vpclmulqdq zmm3,zmm20,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_795: + + or r13,r13 + je NEAR $L$_after_reduction_795 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_795: + jmp NEAR $L$_last_blocks_done_771 +$L$_last_num_blocks_is_13_771: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,192 + kmovq k1,[rax*8+r10] + cmp r15d,243 + jae NEAR $L$_16_blocks_overflow_796 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + vpaddd xmm5,xmm4,xmm27 + jmp NEAR $L$_16_blocks_ok_796 + +$L$_16_blocks_overflow_796: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpaddd zmm5,zmm4,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb xmm5,xmm5,xmm29 +$L$_16_blocks_ok_796: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm5,0 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 
zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vpxorq xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20,ZMMWORD[128+r11*1+r9] + vmovdqu8 xmm21{k1}{z},[192+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vaesenclast xmm5,xmm5,xmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vpxorq xmm5,xmm5,xmm21 + vextracti32x4 xmm11,zmm5,0 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm4 + vmovdqu8 XMMWORD[192+r11*1+r10]{k1},xmm5 + vmovdqu8 zmm21{k1}{z},zmm21 + vpshufb zmm17,zmm17,zmm29 + vpshufb zmm19,zmm19,zmm29 + vpshufb zmm20,zmm20,zmm29 + vpshufb xmm21,xmm21,xmm29 + vextracti32x4 xmm7,zmm21,0 + sub r13,16 * (13 - 1) + + 
+ cmp r13,16 + jl NEAR $L$_small_initial_partial_block_797 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[144+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[208+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm21,xmm1,0x01 + vpclmulqdq xmm5,xmm21,xmm1,0x10 + vpclmulqdq xmm0,xmm21,xmm1,0x11 + vpclmulqdq xmm3,xmm21,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_797 +$L$_small_initial_partial_block_797: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[160+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[224+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + + vpxorq zmm30,zmm30,zmm26 + vpxorq zmm8,zmm8,zmm24 + vpxorq zmm22,zmm22,zmm25 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_797: + + or r13,r13 + je NEAR $L$_after_reduction_797 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_797: + jmp NEAR $L$_last_blocks_done_771 +$L$_last_num_blocks_is_14_771: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,192 + kmovq k1,[rax*8+r10] + cmp r15d,242 + jae NEAR $L$_16_blocks_overflow_798 + vpaddd zmm0,zmm2,zmm28 + vpaddd 
zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + vpaddd ymm5,ymm4,ymm27 + jmp NEAR $L$_16_blocks_ok_798 + +$L$_16_blocks_overflow_798: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpaddd zmm5,zmm4,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb ymm5,ymm5,ymm29 +$L$_16_blocks_ok_798: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm5,1 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vpxorq ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20,ZMMWORD[128+r11*1+r9] + vmovdqu8 ymm21{k1}{z},[192+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc 
ymm5,ymm5,ymm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vaesenclast ymm5,ymm5,ymm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vpxorq ymm5,ymm5,ymm21 + vextracti32x4 xmm11,zmm5,1 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm4 + vmovdqu8 YMMWORD[192+r11*1+r10]{k1},ymm5 + vmovdqu8 zmm21{k1}{z},zmm21 + vpshufb zmm17,zmm17,zmm29 + vpshufb zmm19,zmm19,zmm29 + vpshufb zmm20,zmm20,zmm29 + vpshufb ymm21,ymm21,ymm29 + vextracti32x4 xmm7,zmm21,1 + sub r13,16 * (14 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_799 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[128+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[192+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm21,ymm1,0x01 + vpclmulqdq ymm5,ymm21,ymm1,0x10 + vpclmulqdq ymm0,ymm21,ymm1,0x11 + vpclmulqdq ymm3,ymm21,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_799 +$L$_small_initial_partial_block_799: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[144+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[208+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm21,xmm1,0x01 + vpclmulqdq xmm5,xmm21,xmm1,0x10 + vpclmulqdq xmm0,xmm21,xmm1,0x11 + vpclmulqdq xmm3,xmm21,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + 
vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_799: + + or r13,r13 + je NEAR $L$_after_reduction_799 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_799: + jmp NEAR $L$_last_blocks_done_771 +$L$_last_num_blocks_is_15_771: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,192 + kmovq k1,[rax*8+r10] + cmp r15d,241 + jae NEAR $L$_16_blocks_overflow_800 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + vpaddd zmm5,zmm4,zmm27 + jmp NEAR $L$_16_blocks_ok_800 + +$L$_16_blocks_overflow_800: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpaddd zmm5,zmm4,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb zmm5,zmm5,zmm29 +$L$_16_blocks_ok_800: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm5,2 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20,ZMMWORD[128+r11*1+r9] + vmovdqu8 zmm21{k1}{z},[192+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 
+ vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vaesenclast zmm5,zmm5,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vpxorq zmm5,zmm5,zmm21 + vextracti32x4 xmm11,zmm5,2 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm4 + vmovdqu8 ZMMWORD[192+r11*1+r10]{k1},zmm5 + vmovdqu8 zmm21{k1}{z},zmm21 + vpshufb zmm17,zmm17,zmm29 + vpshufb zmm19,zmm19,zmm29 + vpshufb zmm20,zmm20,zmm29 + vpshufb zmm21,zmm21,zmm29 + vextracti32x4 xmm7,zmm21,2 + sub r13,16 * (15 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_801 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[112+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[176+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm21,zmm1,0x01 + vpclmulqdq zmm5,zmm21,zmm1,0x10 + vpclmulqdq zmm0,zmm21,zmm1,0x11 + vpclmulqdq zmm3,zmm21,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_801 +$L$_small_initial_partial_block_801: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[128+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 
zmm1,ZMMWORD[192+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm21,ymm1,0x01 + vpclmulqdq ymm5,ymm21,ymm1,0x10 + vpclmulqdq ymm0,ymm21,ymm1,0x11 + vpclmulqdq ymm3,ymm21,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_801: + + or r13,r13 + je NEAR $L$_after_reduction_801 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_801: + jmp NEAR $L$_last_blocks_done_771 +$L$_last_num_blocks_is_16_771: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,192 + kmovq k1,[rax*8+r10] + cmp r15d,240 + jae NEAR $L$_16_blocks_overflow_802 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + vpaddd zmm5,zmm4,zmm27 + jmp NEAR $L$_16_blocks_ok_802 + +$L$_16_blocks_overflow_802: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpaddd zmm5,zmm4,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb zmm5,zmm5,zmm29 +$L$_16_blocks_ok_802: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm5,3 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + 
vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20,ZMMWORD[128+r11*1+r9] + vmovdqu8 zmm21{k1}{z},[192+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vaesenclast zmm5,zmm5,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vpxorq zmm5,zmm5,zmm21 + vextracti32x4 xmm11,zmm5,3 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm4 + vmovdqu8 ZMMWORD[192+r11*1+r10]{k1},zmm5 + vmovdqu8 zmm21{k1}{z},zmm21 + vpshufb zmm17,zmm17,zmm29 + vpshufb zmm19,zmm19,zmm29 + vpshufb zmm20,zmm20,zmm29 + vpshufb zmm21,zmm21,zmm29 + vextracti32x4 xmm7,zmm21,3 + sub r13,16 * (16 - 1) +$L$_small_initial_partial_block_803: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[112+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[176+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm21,zmm1,0x01 + vpclmulqdq zmm5,zmm21,zmm1,0x10 + vpclmulqdq zmm0,zmm21,zmm1,0x11 + vpclmulqdq zmm3,zmm21,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + 
vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_803: + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_803: + jmp NEAR $L$_last_blocks_done_771 +$L$_last_num_blocks_is_0_771: + vmovdqa64 zmm13,ZMMWORD[768+rsp] + vpxorq zmm13,zmm13,zmm14 + vmovdqu64 zmm12,ZMMWORD[rbx*1+rsp] + vpclmulqdq zmm0,zmm13,zmm12,0x11 + vpclmulqdq zmm3,zmm13,zmm12,0x00 + vpclmulqdq zmm4,zmm13,zmm12,0x01 + vpclmulqdq zmm5,zmm13,zmm12,0x10 + vmovdqa64 zmm13,ZMMWORD[832+rsp] + vmovdqu64 zmm12,ZMMWORD[64+rbx*1+rsp] + vpclmulqdq zmm6,zmm13,zmm12,0x11 + vpclmulqdq zmm7,zmm13,zmm12,0x00 + vpclmulqdq zmm10,zmm13,zmm12,0x01 + vpclmulqdq zmm11,zmm13,zmm12,0x10 + vpxorq zmm26,zmm4,zmm10 + vpxorq zmm24,zmm0,zmm6 + vpxorq zmm25,zmm3,zmm7 + vpternlogq zmm26,zmm5,zmm11,0x96 + vmovdqa64 zmm13,ZMMWORD[896+rsp] + vmovdqu64 zmm12,ZMMWORD[128+rbx*1+rsp] + vpclmulqdq zmm0,zmm13,zmm12,0x11 + vpclmulqdq zmm3,zmm13,zmm12,0x00 + vpclmulqdq zmm4,zmm13,zmm12,0x01 + vpclmulqdq zmm5,zmm13,zmm12,0x10 + vmovdqa64 zmm13,ZMMWORD[960+rsp] + vmovdqu64 zmm12,ZMMWORD[192+rbx*1+rsp] + vpclmulqdq zmm6,zmm13,zmm12,0x11 + vpclmulqdq zmm7,zmm13,zmm12,0x00 + vpclmulqdq zmm10,zmm13,zmm12,0x01 + vpclmulqdq zmm11,zmm13,zmm12,0x10 + + vpternlogq zmm26,zmm4,zmm10,0x96 + vpternlogq zmm24,zmm0,zmm6,0x96 + vpternlogq zmm25,zmm3,zmm7,0x96 + vpternlogq zmm26,zmm5,zmm11,0x96 + + vpsrldq zmm0,zmm26,8 + vpslldq zmm3,zmm26,8 + vpxorq zmm24,zmm24,zmm0 + vpxorq zmm25,zmm25,zmm3 + vextracti64x4 ymm0,zmm24,1 + vpxorq ymm24,ymm24,ymm0 + vextracti32x4 xmm0,ymm24,1 + vpxorq xmm24,xmm24,xmm0 + vextracti64x4 ymm3,zmm25,1 + vpxorq ymm25,ymm25,ymm3 + vextracti32x4 xmm3,ymm25,1 + vpxorq xmm25,xmm25,xmm3 + vmovdqa64 xmm4,XMMWORD[POLY2] + + + vpclmulqdq xmm0,xmm4,xmm25,0x01 + vpslldq xmm0,xmm0,8 + vpxorq xmm0,xmm25,xmm0 + + + vpclmulqdq xmm3,xmm4,xmm0,0x00 + vpsrldq xmm3,xmm3,4 + vpclmulqdq xmm14,xmm4,xmm0,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm3,xmm24,0x96 + +$L$_last_blocks_done_771: + vpshufb xmm2,xmm2,xmm29 + jmp NEAR $L$_ghash_done_659 + +$L$_message_below_equal_16_blocks_659: + + + mov r12d,r13d + add r12d,15 + shr r12d,4 + cmp r12,8 + je NEAR $L$_small_initial_num_blocks_is_8_804 + jl NEAR $L$_small_initial_num_blocks_is_7_1_804 + + + cmp r12,12 + je NEAR $L$_small_initial_num_blocks_is_12_804 + jl NEAR $L$_small_initial_num_blocks_is_11_9_804 + + + cmp r12,16 + je NEAR $L$_small_initial_num_blocks_is_16_804 + cmp r12,15 + je NEAR $L$_small_initial_num_blocks_is_15_804 + cmp r12,14 + je NEAR $L$_small_initial_num_blocks_is_14_804 + jmp NEAR $L$_small_initial_num_blocks_is_13_804 + +$L$_small_initial_num_blocks_is_11_9_804: + + cmp r12,11 + je NEAR $L$_small_initial_num_blocks_is_11_804 + cmp r12,10 + je NEAR $L$_small_initial_num_blocks_is_10_804 + jmp NEAR $L$_small_initial_num_blocks_is_9_804 + +$L$_small_initial_num_blocks_is_7_1_804: + cmp r12,4 + je NEAR $L$_small_initial_num_blocks_is_4_804 + jl NEAR $L$_small_initial_num_blocks_is_3_1_804 + + cmp r12,7 + je NEAR $L$_small_initial_num_blocks_is_7_804 + cmp r12,6 + je NEAR 
$L$_small_initial_num_blocks_is_6_804 + jmp NEAR $L$_small_initial_num_blocks_is_5_804 + +$L$_small_initial_num_blocks_is_3_1_804: + + cmp r12,3 + je NEAR $L$_small_initial_num_blocks_is_3_804 + cmp r12,2 + je NEAR $L$_small_initial_num_blocks_is_2_804 + + + + + +$L$_small_initial_num_blocks_is_1_804: + vmovdqa64 xmm29,XMMWORD[SHUF_MASK] + vpaddd xmm0,xmm2,XMMWORD[ONE] + lea r10,[byte64_len_to_mask_table] + mov r15,r13 + kmovq k1,[r15*8+r10] + vextracti32x4 xmm2,zmm0,0 + vpshufb xmm0,xmm0,xmm29 + vmovdqu8 xmm6{k1}{z},[r11*1+r9] + vbroadcastf64x2 zmm15,ZMMWORD[rcx] + vpxorq xmm0,xmm0,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[16+rcx] + vaesenc xmm0,xmm0,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[32+rcx] + vaesenc xmm0,xmm0,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[48+rcx] + vaesenc xmm0,xmm0,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[64+rcx] + vaesenc xmm0,xmm0,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[80+rcx] + vaesenc xmm0,xmm0,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[96+rcx] + vaesenc xmm0,xmm0,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[112+rcx] + vaesenc xmm0,xmm0,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[128+rcx] + vaesenc xmm0,xmm0,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[144+rcx] + vaesenc xmm0,xmm0,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[160+rcx] + vaesenc xmm0,xmm0,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[176+rcx] + vaesenc xmm0,xmm0,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[192+rcx] + vaesenclast xmm0,xmm0,xmm15 + vpxorq xmm0,xmm0,xmm6 + vextracti32x4 xmm12,zmm0,0 + mov r10,QWORD[120+rbp] + vmovdqu8 XMMWORD[r11*1+r10]{k1},xmm0 + vmovdqu8 zmm0{k1}{z},zmm0 + vpshufb xmm6,xmm6,xmm29 + vextracti32x4 xmm13,zmm6,0 + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_805 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 xmm20,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm6,xmm20,0x01 + vpclmulqdq xmm5,xmm6,xmm20,0x10 + vpclmulqdq xmm0,xmm6,xmm20,0x11 + vpclmulqdq xmm3,xmm6,xmm20,0x00 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_805 +$L$_small_initial_partial_block_805: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm12 + + + + + + + + + + + + vpxorq xmm14,xmm14,xmm13 + + jmp NEAR $L$_after_reduction_805 +$L$_small_initial_compute_done_805: +$L$_after_reduction_805: + jmp NEAR $L$_small_initial_blocks_encrypted_804 +$L$_small_initial_num_blocks_is_2_804: + vmovdqa64 ymm29,YMMWORD[SHUF_MASK] + vshufi64x2 ymm0,ymm2,ymm2,0 + vpaddd ymm0,ymm0,YMMWORD[ddq_add_1234] + lea r10,[byte64_len_to_mask_table] + mov r15,r13 + kmovq k1,[r15*8+r10] + vextracti32x4 xmm2,zmm0,1 + vpshufb ymm0,ymm0,ymm29 + vmovdqu8 ymm6{k1}{z},[r11*1+r9] + vbroadcastf64x2 zmm15,ZMMWORD[rcx] + vpxorq ymm0,ymm0,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[16+rcx] + vaesenc ymm0,ymm0,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[32+rcx] + vaesenc ymm0,ymm0,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[48+rcx] + vaesenc ymm0,ymm0,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[64+rcx] + vaesenc ymm0,ymm0,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[80+rcx] + vaesenc 
ymm0,ymm0,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[96+rcx] + vaesenc ymm0,ymm0,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[112+rcx] + vaesenc ymm0,ymm0,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[128+rcx] + vaesenc ymm0,ymm0,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[144+rcx] + vaesenc ymm0,ymm0,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[160+rcx] + vaesenc ymm0,ymm0,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[176+rcx] + vaesenc ymm0,ymm0,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[192+rcx] + vaesenclast ymm0,ymm0,ymm15 + vpxorq ymm0,ymm0,ymm6 + vextracti32x4 xmm12,zmm0,1 + mov r10,QWORD[120+rbp] + vmovdqu8 YMMWORD[r11*1+r10]{k1},ymm0 + vmovdqu8 zmm0{k1}{z},zmm0 + vpshufb ymm6,ymm6,ymm29 + vextracti32x4 xmm13,zmm6,1 + sub r13,16 * (2 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_806 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 ymm20,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm6,ymm20,0x01 + vpclmulqdq ymm5,ymm6,ymm20,0x10 + vpclmulqdq ymm0,ymm6,ymm20,0x11 + vpclmulqdq ymm3,ymm6,ymm20,0x00 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_806 +$L$_small_initial_partial_block_806: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm12 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 xmm20,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm6,xmm20,0x01 + vpclmulqdq xmm5,xmm6,xmm20,0x10 + vpclmulqdq xmm0,xmm6,xmm20,0x11 + vpclmulqdq xmm3,xmm6,xmm20,0x00 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_806: + + or r13,r13 + je NEAR $L$_after_reduction_806 + vpxorq xmm14,xmm14,xmm13 +$L$_after_reduction_806: + jmp NEAR $L$_small_initial_blocks_encrypted_804 +$L$_small_initial_num_blocks_is_3_804: + vmovdqa64 zmm29,ZMMWORD[SHUF_MASK] + vshufi64x2 zmm2,zmm2,zmm2,0 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + lea r10,[byte64_len_to_mask_table] + mov r15,r13 + kmovq k1,[r15*8+r10] + vextracti32x4 xmm2,zmm0,2 + vpshufb zmm0,zmm0,zmm29 + vmovdqu8 zmm6{k1}{z},[r11*1+r9] + vbroadcastf64x2 zmm15,ZMMWORD[rcx] + vpxorq zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[16+rcx] + vaesenc zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[32+rcx] + vaesenc zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[48+rcx] + vaesenc zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[64+rcx] + vaesenc zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[80+rcx] + vaesenc zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[96+rcx] + vaesenc zmm0,zmm0,zmm15 + 
vbroadcastf64x2 zmm15,ZMMWORD[112+rcx] + vaesenc zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[128+rcx] + vaesenc zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[192+rcx] + vaesenclast zmm0,zmm0,zmm15 + vpxorq zmm0,zmm0,zmm6 + vextracti32x4 xmm12,zmm0,2 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10]{k1},zmm0 + vmovdqu8 zmm0{k1}{z},zmm0 + vpshufb zmm6,zmm6,zmm29 + vextracti32x4 xmm13,zmm6,2 + sub r13,16 * (3 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_807 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 ymm20,YMMWORD[304+rdx] + vinserti64x2 zmm20,zmm20,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm6,zmm20,0x01 + vpclmulqdq zmm5,zmm6,zmm20,0x10 + vpclmulqdq zmm0,zmm6,zmm20,0x11 + vpclmulqdq zmm3,zmm6,zmm20,0x00 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_807 +$L$_small_initial_partial_block_807: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm12 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 ymm20,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm6,ymm20,0x01 + vpclmulqdq ymm5,ymm6,ymm20,0x10 + vpclmulqdq ymm0,ymm6,ymm20,0x11 + vpclmulqdq ymm3,ymm6,ymm20,0x00 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_807: + + or r13,r13 + je NEAR $L$_after_reduction_807 + vpxorq xmm14,xmm14,xmm13 +$L$_after_reduction_807: + jmp NEAR $L$_small_initial_blocks_encrypted_804 +$L$_small_initial_num_blocks_is_4_804: + vmovdqa64 zmm29,ZMMWORD[SHUF_MASK] + vshufi64x2 zmm2,zmm2,zmm2,0 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + lea r10,[byte64_len_to_mask_table] + mov r15,r13 + kmovq k1,[r15*8+r10] + vextracti32x4 xmm2,zmm0,3 + vpshufb zmm0,zmm0,zmm29 + vmovdqu8 zmm6{k1}{z},[r11*1+r9] + vbroadcastf64x2 zmm15,ZMMWORD[rcx] + vpxorq zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[16+rcx] + vaesenc zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[32+rcx] + vaesenc zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[48+rcx] + vaesenc zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[64+rcx] + vaesenc zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[80+rcx] + vaesenc zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[96+rcx] + vaesenc zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[112+rcx] + vaesenc 
zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[128+rcx] + vaesenc zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[192+rcx] + vaesenclast zmm0,zmm0,zmm15 + vpxorq zmm0,zmm0,zmm6 + vextracti32x4 xmm12,zmm0,3 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10]{k1},zmm0 + vmovdqu8 zmm0{k1}{z},zmm0 + vpshufb zmm6,zmm6,zmm29 + vextracti32x4 xmm13,zmm6,3 + sub r13,16 * (4 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_808 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[288+rdx] + vpclmulqdq zmm15,zmm6,zmm20,0x11 + vpclmulqdq zmm16,zmm6,zmm20,0x00 + vpclmulqdq zmm17,zmm6,zmm20,0x01 + vpclmulqdq zmm19,zmm6,zmm20,0x10 + + vpxorq zmm17,zmm17,zmm19 + vpsrldq zmm4,zmm17,8 + vpslldq zmm5,zmm17,8 + vpxorq zmm0,zmm15,zmm4 + vpxorq zmm3,zmm16,zmm5 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_808 +$L$_small_initial_partial_block_808: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm12 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 ymm20,YMMWORD[304+rdx] + vinserti64x2 zmm20,zmm20,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm6,zmm20,0x01 + vpclmulqdq zmm5,zmm6,zmm20,0x10 + vpclmulqdq zmm0,zmm6,zmm20,0x11 + vpclmulqdq zmm3,zmm6,zmm20,0x00 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_808: + + or r13,r13 + je NEAR $L$_after_reduction_808 + vpxorq xmm14,xmm14,xmm13 +$L$_after_reduction_808: + jmp NEAR $L$_small_initial_blocks_encrypted_804 +$L$_small_initial_num_blocks_is_5_804: + vmovdqa64 zmm29,ZMMWORD[SHUF_MASK] + vshufi64x2 zmm2,zmm2,zmm2,0 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpaddd zmm3,zmm2,ZMMWORD[ddq_add_5678] + lea r10,[byte64_len_to_mask_table] + mov r15,r13 + sub r15,64 + kmovq k1,[r15*8+r10] + vextracti32x4 xmm2,zmm3,0 + vpshufb zmm0,zmm0,zmm29 + vpshufb xmm3,xmm3,xmm29 + vmovdqu8 zmm6,ZMMWORD[r11*1+r9] + vmovdqu8 xmm7{k1}{z},[64+r11*1+r9] + vbroadcastf64x2 zmm15,ZMMWORD[rcx] + vpxorq zmm0,zmm0,zmm15 + vpxorq xmm3,xmm3,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[16+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc xmm3,xmm3,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[32+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc xmm3,xmm3,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[48+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc xmm3,xmm3,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[64+rcx] + vaesenc zmm0,zmm0,zmm15 + 
vaesenc xmm3,xmm3,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[80+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc xmm3,xmm3,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[96+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc xmm3,xmm3,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[112+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc xmm3,xmm3,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[128+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc xmm3,xmm3,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc xmm3,xmm3,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc xmm3,xmm3,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc xmm3,xmm3,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[192+rcx] + vaesenclast zmm0,zmm0,zmm15 + vaesenclast xmm3,xmm3,xmm15 + vpxorq zmm0,zmm0,zmm6 + vpxorq xmm3,xmm3,xmm7 + vextracti32x4 xmm12,zmm3,0 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 XMMWORD[64+r11*1+r10]{k1},xmm3 + vmovdqu8 zmm3{k1}{z},zmm3 + vpshufb zmm6,zmm6,zmm29 + vpshufb xmm7,xmm7,xmm29 + vextracti32x4 xmm13,zmm7,0 + sub r13,16 * (5 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_809 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[272+rdx] + vpclmulqdq zmm15,zmm6,zmm20,0x11 + vpclmulqdq zmm16,zmm6,zmm20,0x00 + vpclmulqdq zmm17,zmm6,zmm20,0x01 + vpclmulqdq zmm19,zmm6,zmm20,0x10 + vmovdqu64 xmm20,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm7,xmm20,0x01 + vpclmulqdq xmm5,xmm7,xmm20,0x10 + vpclmulqdq xmm0,xmm7,xmm20,0x11 + vpclmulqdq xmm3,xmm7,xmm20,0x00 + + vpxorq zmm4,zmm4,zmm17 + vpxorq zmm5,zmm5,zmm19 + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm16 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_809 +$L$_small_initial_partial_block_809: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm12 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[288+rdx] + vpclmulqdq zmm15,zmm6,zmm20,0x11 + vpclmulqdq zmm16,zmm6,zmm20,0x00 + vpclmulqdq zmm17,zmm6,zmm20,0x01 + vpclmulqdq zmm19,zmm6,zmm20,0x10 + + vpxorq zmm17,zmm17,zmm19 + vpsrldq zmm4,zmm17,8 + vpslldq zmm5,zmm17,8 + vpxorq zmm0,zmm15,zmm4 + vpxorq zmm3,zmm16,zmm5 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_809: + + or r13,r13 + je NEAR $L$_after_reduction_809 + vpxorq xmm14,xmm14,xmm13 +$L$_after_reduction_809: + jmp NEAR $L$_small_initial_blocks_encrypted_804 +$L$_small_initial_num_blocks_is_6_804: + vmovdqa64 zmm29,ZMMWORD[SHUF_MASK] + vshufi64x2 zmm2,zmm2,zmm2,0 + vpaddd 
zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpaddd zmm3,zmm2,ZMMWORD[ddq_add_5678] + lea r10,[byte64_len_to_mask_table] + mov r15,r13 + sub r15,64 + kmovq k1,[r15*8+r10] + vextracti32x4 xmm2,zmm3,1 + vpshufb zmm0,zmm0,zmm29 + vpshufb ymm3,ymm3,ymm29 + vmovdqu8 zmm6,ZMMWORD[r11*1+r9] + vmovdqu8 ymm7{k1}{z},[64+r11*1+r9] + vbroadcastf64x2 zmm15,ZMMWORD[rcx] + vpxorq zmm0,zmm0,zmm15 + vpxorq ymm3,ymm3,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[16+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc ymm3,ymm3,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[32+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc ymm3,ymm3,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[48+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc ymm3,ymm3,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[64+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc ymm3,ymm3,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[80+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc ymm3,ymm3,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[96+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc ymm3,ymm3,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[112+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc ymm3,ymm3,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[128+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc ymm3,ymm3,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc ymm3,ymm3,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc ymm3,ymm3,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc ymm3,ymm3,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[192+rcx] + vaesenclast zmm0,zmm0,zmm15 + vaesenclast ymm3,ymm3,ymm15 + vpxorq zmm0,zmm0,zmm6 + vpxorq ymm3,ymm3,ymm7 + vextracti32x4 xmm12,zmm3,1 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 YMMWORD[64+r11*1+r10]{k1},ymm3 + vmovdqu8 zmm3{k1}{z},zmm3 + vpshufb zmm6,zmm6,zmm29 + vpshufb ymm7,ymm7,ymm29 + vextracti32x4 xmm13,zmm7,1 + sub r13,16 * (6 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_810 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[256+rdx] + vpclmulqdq zmm15,zmm6,zmm20,0x11 + vpclmulqdq zmm16,zmm6,zmm20,0x00 + vpclmulqdq zmm17,zmm6,zmm20,0x01 + vpclmulqdq zmm19,zmm6,zmm20,0x10 + vmovdqu64 ymm20,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm7,ymm20,0x01 + vpclmulqdq ymm5,ymm7,ymm20,0x10 + vpclmulqdq ymm0,ymm7,ymm20,0x11 + vpclmulqdq ymm3,ymm7,ymm20,0x00 + + vpxorq zmm4,zmm4,zmm17 + vpxorq zmm5,zmm5,zmm19 + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm16 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_810 +$L$_small_initial_partial_block_810: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm12 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[272+rdx] + vpclmulqdq zmm15,zmm6,zmm20,0x11 + vpclmulqdq zmm16,zmm6,zmm20,0x00 + vpclmulqdq zmm17,zmm6,zmm20,0x01 + vpclmulqdq zmm19,zmm6,zmm20,0x10 + vmovdqu64 xmm20,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm7,xmm20,0x01 + vpclmulqdq xmm5,xmm7,xmm20,0x10 + vpclmulqdq xmm0,xmm7,xmm20,0x11 + vpclmulqdq xmm3,xmm7,xmm20,0x00 + + vpxorq 
zmm4,zmm4,zmm17 + vpxorq zmm5,zmm5,zmm19 + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm16 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_810: + + or r13,r13 + je NEAR $L$_after_reduction_810 + vpxorq xmm14,xmm14,xmm13 +$L$_after_reduction_810: + jmp NEAR $L$_small_initial_blocks_encrypted_804 +$L$_small_initial_num_blocks_is_7_804: + vmovdqa64 zmm29,ZMMWORD[SHUF_MASK] + vshufi64x2 zmm2,zmm2,zmm2,0 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpaddd zmm3,zmm2,ZMMWORD[ddq_add_5678] + lea r10,[byte64_len_to_mask_table] + mov r15,r13 + sub r15,64 + kmovq k1,[r15*8+r10] + vextracti32x4 xmm2,zmm3,2 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vmovdqu8 zmm6,ZMMWORD[r11*1+r9] + vmovdqu8 zmm7{k1}{z},[64+r11*1+r9] + vbroadcastf64x2 zmm15,ZMMWORD[rcx] + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[16+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[32+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[48+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[64+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[80+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[96+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[112+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[128+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[192+rcx] + vaesenclast zmm0,zmm0,zmm15 + vaesenclast zmm3,zmm3,zmm15 + vpxorq zmm0,zmm0,zmm6 + vpxorq zmm3,zmm3,zmm7 + vextracti32x4 xmm12,zmm3,2 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10]{k1},zmm3 + vmovdqu8 zmm3{k1}{z},zmm3 + vpshufb zmm6,zmm6,zmm29 + vpshufb zmm7,zmm7,zmm29 + vextracti32x4 xmm13,zmm7,2 + sub r13,16 * (7 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_811 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[240+rdx] + vpclmulqdq zmm15,zmm6,zmm20,0x11 + vpclmulqdq zmm16,zmm6,zmm20,0x00 + vpclmulqdq zmm17,zmm6,zmm20,0x01 + vpclmulqdq zmm19,zmm6,zmm20,0x10 + vmovdqu64 ymm20,YMMWORD[304+rdx] + vinserti64x2 zmm20,zmm20,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm7,zmm20,0x01 + vpclmulqdq zmm5,zmm7,zmm20,0x10 + vpclmulqdq zmm0,zmm7,zmm20,0x11 + vpclmulqdq zmm3,zmm7,zmm20,0x00 + + vpxorq zmm4,zmm4,zmm17 + vpxorq zmm5,zmm5,zmm19 + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm16 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + 
vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_811 +$L$_small_initial_partial_block_811: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm12 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[256+rdx] + vpclmulqdq zmm15,zmm6,zmm20,0x11 + vpclmulqdq zmm16,zmm6,zmm20,0x00 + vpclmulqdq zmm17,zmm6,zmm20,0x01 + vpclmulqdq zmm19,zmm6,zmm20,0x10 + vmovdqu64 ymm20,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm7,ymm20,0x01 + vpclmulqdq ymm5,ymm7,ymm20,0x10 + vpclmulqdq ymm0,ymm7,ymm20,0x11 + vpclmulqdq ymm3,ymm7,ymm20,0x00 + + vpxorq zmm4,zmm4,zmm17 + vpxorq zmm5,zmm5,zmm19 + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm16 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_811: + + or r13,r13 + je NEAR $L$_after_reduction_811 + vpxorq xmm14,xmm14,xmm13 +$L$_after_reduction_811: + jmp NEAR $L$_small_initial_blocks_encrypted_804 +$L$_small_initial_num_blocks_is_8_804: + vmovdqa64 zmm29,ZMMWORD[SHUF_MASK] + vshufi64x2 zmm2,zmm2,zmm2,0 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpaddd zmm3,zmm2,ZMMWORD[ddq_add_5678] + lea r10,[byte64_len_to_mask_table] + mov r15,r13 + sub r15,64 + kmovq k1,[r15*8+r10] + vextracti32x4 xmm2,zmm3,3 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vmovdqu8 zmm6,ZMMWORD[r11*1+r9] + vmovdqu8 zmm7{k1}{z},[64+r11*1+r9] + vbroadcastf64x2 zmm15,ZMMWORD[rcx] + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[16+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[32+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[48+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[64+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[80+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[96+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[112+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[128+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[192+rcx] + vaesenclast zmm0,zmm0,zmm15 + vaesenclast 
zmm3,zmm3,zmm15 + vpxorq zmm0,zmm0,zmm6 + vpxorq zmm3,zmm3,zmm7 + vextracti32x4 xmm12,zmm3,3 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10]{k1},zmm3 + vmovdqu8 zmm3{k1}{z},zmm3 + vpshufb zmm6,zmm6,zmm29 + vpshufb zmm7,zmm7,zmm29 + vextracti32x4 xmm13,zmm7,3 + sub r13,16 * (8 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_812 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[224+rdx] + vpclmulqdq zmm0,zmm6,zmm20,0x11 + vpclmulqdq zmm3,zmm6,zmm20,0x00 + vpclmulqdq zmm4,zmm6,zmm20,0x01 + vpclmulqdq zmm5,zmm6,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[288+rdx] + vpclmulqdq zmm15,zmm7,zmm20,0x11 + vpclmulqdq zmm16,zmm7,zmm20,0x00 + vpclmulqdq zmm17,zmm7,zmm20,0x01 + vpclmulqdq zmm19,zmm7,zmm20,0x10 + vpxorq zmm15,zmm0,zmm15 + vpxorq zmm16,zmm3,zmm16 + vpxorq zmm17,zmm4,zmm17 + vpxorq zmm19,zmm5,zmm19 + + vpxorq zmm17,zmm17,zmm19 + vpsrldq zmm4,zmm17,8 + vpslldq zmm5,zmm17,8 + vpxorq zmm0,zmm15,zmm4 + vpxorq zmm3,zmm16,zmm5 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_812 +$L$_small_initial_partial_block_812: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm12 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[240+rdx] + vpclmulqdq zmm15,zmm6,zmm20,0x11 + vpclmulqdq zmm16,zmm6,zmm20,0x00 + vpclmulqdq zmm17,zmm6,zmm20,0x01 + vpclmulqdq zmm19,zmm6,zmm20,0x10 + vmovdqu64 ymm20,YMMWORD[304+rdx] + vinserti64x2 zmm20,zmm20,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm7,zmm20,0x01 + vpclmulqdq zmm5,zmm7,zmm20,0x10 + vpclmulqdq zmm0,zmm7,zmm20,0x11 + vpclmulqdq zmm3,zmm7,zmm20,0x00 + + vpxorq zmm4,zmm4,zmm17 + vpxorq zmm5,zmm5,zmm19 + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm16 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_812: + + or r13,r13 + je NEAR $L$_after_reduction_812 + vpxorq xmm14,xmm14,xmm13 +$L$_after_reduction_812: + jmp NEAR $L$_small_initial_blocks_encrypted_804 +$L$_small_initial_num_blocks_is_9_804: + vmovdqa64 zmm29,ZMMWORD[SHUF_MASK] + vshufi64x2 zmm2,zmm2,zmm2,0 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpaddd zmm3,zmm2,ZMMWORD[ddq_add_5678] + vpaddd zmm4,zmm0,ZMMWORD[ddq_add_8888] + lea r10,[byte64_len_to_mask_table] + mov r15,r13 + sub r15,128 + kmovq k1,[r15*8+r10] + vextracti32x4 xmm2,zmm4,0 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb xmm4,xmm4,xmm29 + vmovdqu8 zmm6,ZMMWORD[r11*1+r9] + vmovdqu8 zmm7,ZMMWORD[64+r11*1+r9] + vmovdqu8 xmm10{k1}{z},[128+r11*1+r9] + vbroadcastf64x2 
zmm15,ZMMWORD[rcx] + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm15 + vpxorq xmm4,xmm4,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[16+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc xmm4,xmm4,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[32+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc xmm4,xmm4,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[48+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc xmm4,xmm4,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[64+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc xmm4,xmm4,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[80+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc xmm4,xmm4,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[96+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc xmm4,xmm4,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[112+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc xmm4,xmm4,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[128+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc xmm4,xmm4,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc xmm4,xmm4,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc xmm4,xmm4,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc xmm4,xmm4,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[192+rcx] + vaesenclast zmm0,zmm0,zmm15 + vaesenclast zmm3,zmm3,zmm15 + vaesenclast xmm4,xmm4,xmm15 + vpxorq zmm0,zmm0,zmm6 + vpxorq zmm3,zmm3,zmm7 + vpxorq xmm4,xmm4,xmm10 + vextracti32x4 xmm12,zmm4,0 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 XMMWORD[128+r11*1+r10]{k1},xmm4 + vmovdqu8 zmm4{k1}{z},zmm4 + vpshufb zmm6,zmm6,zmm29 + vpshufb zmm7,zmm7,zmm29 + vpshufb xmm10,xmm10,xmm29 + vextracti32x4 xmm13,zmm10,0 + sub r13,16 * (9 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_813 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[208+rdx] + vpclmulqdq zmm0,zmm6,zmm20,0x11 + vpclmulqdq zmm3,zmm6,zmm20,0x00 + vpclmulqdq zmm4,zmm6,zmm20,0x01 + vpclmulqdq zmm5,zmm6,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[272+rdx] + vpclmulqdq zmm15,zmm7,zmm20,0x11 + vpclmulqdq zmm16,zmm7,zmm20,0x00 + vpclmulqdq zmm17,zmm7,zmm20,0x01 + vpclmulqdq zmm19,zmm7,zmm20,0x10 + vpxorq zmm15,zmm0,zmm15 + vpxorq zmm16,zmm3,zmm16 + vpxorq zmm17,zmm4,zmm17 + vpxorq zmm19,zmm5,zmm19 + vmovdqu64 xmm20,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm10,xmm20,0x01 + vpclmulqdq xmm5,xmm10,xmm20,0x10 + vpclmulqdq xmm0,xmm10,xmm20,0x11 + vpclmulqdq xmm3,xmm10,xmm20,0x00 + + vpxorq zmm4,zmm4,zmm17 + vpxorq zmm5,zmm5,zmm19 + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm16 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_813 +$L$_small_initial_partial_block_813: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm12 + 
vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[224+rdx] + vpclmulqdq zmm0,zmm6,zmm20,0x11 + vpclmulqdq zmm3,zmm6,zmm20,0x00 + vpclmulqdq zmm4,zmm6,zmm20,0x01 + vpclmulqdq zmm5,zmm6,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[288+rdx] + vpclmulqdq zmm15,zmm7,zmm20,0x11 + vpclmulqdq zmm16,zmm7,zmm20,0x00 + vpclmulqdq zmm17,zmm7,zmm20,0x01 + vpclmulqdq zmm19,zmm7,zmm20,0x10 + vpxorq zmm15,zmm0,zmm15 + vpxorq zmm16,zmm3,zmm16 + vpxorq zmm17,zmm4,zmm17 + vpxorq zmm19,zmm5,zmm19 + + vpxorq zmm17,zmm17,zmm19 + vpsrldq zmm4,zmm17,8 + vpslldq zmm5,zmm17,8 + vpxorq zmm0,zmm15,zmm4 + vpxorq zmm3,zmm16,zmm5 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_813: + + or r13,r13 + je NEAR $L$_after_reduction_813 + vpxorq xmm14,xmm14,xmm13 +$L$_after_reduction_813: + jmp NEAR $L$_small_initial_blocks_encrypted_804 +$L$_small_initial_num_blocks_is_10_804: + vmovdqa64 zmm29,ZMMWORD[SHUF_MASK] + vshufi64x2 zmm2,zmm2,zmm2,0 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpaddd zmm3,zmm2,ZMMWORD[ddq_add_5678] + vpaddd zmm4,zmm0,ZMMWORD[ddq_add_8888] + lea r10,[byte64_len_to_mask_table] + mov r15,r13 + sub r15,128 + kmovq k1,[r15*8+r10] + vextracti32x4 xmm2,zmm4,1 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb ymm4,ymm4,ymm29 + vmovdqu8 zmm6,ZMMWORD[r11*1+r9] + vmovdqu8 zmm7,ZMMWORD[64+r11*1+r9] + vmovdqu8 ymm10{k1}{z},[128+r11*1+r9] + vbroadcastf64x2 zmm15,ZMMWORD[rcx] + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm15 + vpxorq ymm4,ymm4,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[16+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc ymm4,ymm4,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[32+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc ymm4,ymm4,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[48+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc ymm4,ymm4,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[64+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc ymm4,ymm4,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[80+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc ymm4,ymm4,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[96+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc ymm4,ymm4,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[112+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc ymm4,ymm4,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[128+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc ymm4,ymm4,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc ymm4,ymm4,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc ymm4,ymm4,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc ymm4,ymm4,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[192+rcx] + vaesenclast zmm0,zmm0,zmm15 + vaesenclast zmm3,zmm3,zmm15 + vaesenclast ymm4,ymm4,ymm15 + vpxorq zmm0,zmm0,zmm6 + vpxorq zmm3,zmm3,zmm7 + vpxorq ymm4,ymm4,ymm10 + vextracti32x4 xmm12,zmm4,1 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 
ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 YMMWORD[128+r11*1+r10]{k1},ymm4 + vmovdqu8 zmm4{k1}{z},zmm4 + vpshufb zmm6,zmm6,zmm29 + vpshufb zmm7,zmm7,zmm29 + vpshufb ymm10,ymm10,ymm29 + vextracti32x4 xmm13,zmm10,1 + sub r13,16 * (10 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_814 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[192+rdx] + vpclmulqdq zmm0,zmm6,zmm20,0x11 + vpclmulqdq zmm3,zmm6,zmm20,0x00 + vpclmulqdq zmm4,zmm6,zmm20,0x01 + vpclmulqdq zmm5,zmm6,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[256+rdx] + vpclmulqdq zmm15,zmm7,zmm20,0x11 + vpclmulqdq zmm16,zmm7,zmm20,0x00 + vpclmulqdq zmm17,zmm7,zmm20,0x01 + vpclmulqdq zmm19,zmm7,zmm20,0x10 + vpxorq zmm15,zmm0,zmm15 + vpxorq zmm16,zmm3,zmm16 + vpxorq zmm17,zmm4,zmm17 + vpxorq zmm19,zmm5,zmm19 + vmovdqu64 ymm20,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm10,ymm20,0x01 + vpclmulqdq ymm5,ymm10,ymm20,0x10 + vpclmulqdq ymm0,ymm10,ymm20,0x11 + vpclmulqdq ymm3,ymm10,ymm20,0x00 + + vpxorq zmm4,zmm4,zmm17 + vpxorq zmm5,zmm5,zmm19 + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm16 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_814 +$L$_small_initial_partial_block_814: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm12 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[208+rdx] + vpclmulqdq zmm0,zmm6,zmm20,0x11 + vpclmulqdq zmm3,zmm6,zmm20,0x00 + vpclmulqdq zmm4,zmm6,zmm20,0x01 + vpclmulqdq zmm5,zmm6,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[272+rdx] + vpclmulqdq zmm15,zmm7,zmm20,0x11 + vpclmulqdq zmm16,zmm7,zmm20,0x00 + vpclmulqdq zmm17,zmm7,zmm20,0x01 + vpclmulqdq zmm19,zmm7,zmm20,0x10 + vpxorq zmm15,zmm0,zmm15 + vpxorq zmm16,zmm3,zmm16 + vpxorq zmm17,zmm4,zmm17 + vpxorq zmm19,zmm5,zmm19 + vmovdqu64 xmm20,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm10,xmm20,0x01 + vpclmulqdq xmm5,xmm10,xmm20,0x10 + vpclmulqdq xmm0,xmm10,xmm20,0x11 + vpclmulqdq xmm3,xmm10,xmm20,0x00 + + vpxorq zmm4,zmm4,zmm17 + vpxorq zmm5,zmm5,zmm19 + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm16 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_814: + + or r13,r13 + je NEAR $L$_after_reduction_814 + vpxorq xmm14,xmm14,xmm13 +$L$_after_reduction_814: + jmp NEAR $L$_small_initial_blocks_encrypted_804 +$L$_small_initial_num_blocks_is_11_804: + vmovdqa64 zmm29,ZMMWORD[SHUF_MASK] + vshufi64x2 zmm2,zmm2,zmm2,0 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + 
vpaddd zmm3,zmm2,ZMMWORD[ddq_add_5678] + vpaddd zmm4,zmm0,ZMMWORD[ddq_add_8888] + lea r10,[byte64_len_to_mask_table] + mov r15,r13 + sub r15,128 + kmovq k1,[r15*8+r10] + vextracti32x4 xmm2,zmm4,2 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vmovdqu8 zmm6,ZMMWORD[r11*1+r9] + vmovdqu8 zmm7,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm10{k1}{z},[128+r11*1+r9] + vbroadcastf64x2 zmm15,ZMMWORD[rcx] + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm15 + vpxorq zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[16+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[32+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[48+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[64+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[80+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[96+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[112+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[128+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[192+rcx] + vaesenclast zmm0,zmm0,zmm15 + vaesenclast zmm3,zmm3,zmm15 + vaesenclast zmm4,zmm4,zmm15 + vpxorq zmm0,zmm0,zmm6 + vpxorq zmm3,zmm3,zmm7 + vpxorq zmm4,zmm4,zmm10 + vextracti32x4 xmm12,zmm4,2 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10]{k1},zmm4 + vmovdqu8 zmm4{k1}{z},zmm4 + vpshufb zmm6,zmm6,zmm29 + vpshufb zmm7,zmm7,zmm29 + vpshufb zmm10,zmm10,zmm29 + vextracti32x4 xmm13,zmm10,2 + sub r13,16 * (11 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_815 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[176+rdx] + vpclmulqdq zmm0,zmm6,zmm20,0x11 + vpclmulqdq zmm3,zmm6,zmm20,0x00 + vpclmulqdq zmm4,zmm6,zmm20,0x01 + vpclmulqdq zmm5,zmm6,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[240+rdx] + vpclmulqdq zmm15,zmm7,zmm20,0x11 + vpclmulqdq zmm16,zmm7,zmm20,0x00 + vpclmulqdq zmm17,zmm7,zmm20,0x01 + vpclmulqdq zmm19,zmm7,zmm20,0x10 + vpxorq zmm15,zmm0,zmm15 + vpxorq zmm16,zmm3,zmm16 + vpxorq zmm17,zmm4,zmm17 + vpxorq zmm19,zmm5,zmm19 + vmovdqu64 ymm20,YMMWORD[304+rdx] + vinserti64x2 zmm20,zmm20,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm10,zmm20,0x01 + vpclmulqdq zmm5,zmm10,zmm20,0x10 + vpclmulqdq zmm0,zmm10,zmm20,0x11 + vpclmulqdq zmm3,zmm10,zmm20,0x00 + + vpxorq zmm4,zmm4,zmm17 + vpxorq zmm5,zmm5,zmm19 + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm16 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + 
vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_815 +$L$_small_initial_partial_block_815: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm12 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[192+rdx] + vpclmulqdq zmm0,zmm6,zmm20,0x11 + vpclmulqdq zmm3,zmm6,zmm20,0x00 + vpclmulqdq zmm4,zmm6,zmm20,0x01 + vpclmulqdq zmm5,zmm6,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[256+rdx] + vpclmulqdq zmm15,zmm7,zmm20,0x11 + vpclmulqdq zmm16,zmm7,zmm20,0x00 + vpclmulqdq zmm17,zmm7,zmm20,0x01 + vpclmulqdq zmm19,zmm7,zmm20,0x10 + vpxorq zmm15,zmm0,zmm15 + vpxorq zmm16,zmm3,zmm16 + vpxorq zmm17,zmm4,zmm17 + vpxorq zmm19,zmm5,zmm19 + vmovdqu64 ymm20,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm10,ymm20,0x01 + vpclmulqdq ymm5,ymm10,ymm20,0x10 + vpclmulqdq ymm0,ymm10,ymm20,0x11 + vpclmulqdq ymm3,ymm10,ymm20,0x00 + + vpxorq zmm4,zmm4,zmm17 + vpxorq zmm5,zmm5,zmm19 + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm16 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_815: + + or r13,r13 + je NEAR $L$_after_reduction_815 + vpxorq xmm14,xmm14,xmm13 +$L$_after_reduction_815: + jmp NEAR $L$_small_initial_blocks_encrypted_804 +$L$_small_initial_num_blocks_is_12_804: + vmovdqa64 zmm29,ZMMWORD[SHUF_MASK] + vshufi64x2 zmm2,zmm2,zmm2,0 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpaddd zmm3,zmm2,ZMMWORD[ddq_add_5678] + vpaddd zmm4,zmm0,ZMMWORD[ddq_add_8888] + lea r10,[byte64_len_to_mask_table] + mov r15,r13 + sub r15,128 + kmovq k1,[r15*8+r10] + vextracti32x4 xmm2,zmm4,3 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vmovdqu8 zmm6,ZMMWORD[r11*1+r9] + vmovdqu8 zmm7,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm10{k1}{z},[128+r11*1+r9] + vbroadcastf64x2 zmm15,ZMMWORD[rcx] + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm15 + vpxorq zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[16+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[32+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[48+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[64+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[80+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[96+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[112+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[128+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc 
zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[192+rcx] + vaesenclast zmm0,zmm0,zmm15 + vaesenclast zmm3,zmm3,zmm15 + vaesenclast zmm4,zmm4,zmm15 + vpxorq zmm0,zmm0,zmm6 + vpxorq zmm3,zmm3,zmm7 + vpxorq zmm4,zmm4,zmm10 + vextracti32x4 xmm12,zmm4,3 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10]{k1},zmm4 + vmovdqu8 zmm4{k1}{z},zmm4 + vpshufb zmm6,zmm6,zmm29 + vpshufb zmm7,zmm7,zmm29 + vpshufb zmm10,zmm10,zmm29 + vextracti32x4 xmm13,zmm10,3 + sub r13,16 * (12 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_816 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[160+rdx] + vpclmulqdq zmm0,zmm6,zmm20,0x11 + vpclmulqdq zmm3,zmm6,zmm20,0x00 + vpclmulqdq zmm4,zmm6,zmm20,0x01 + vpclmulqdq zmm5,zmm6,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[224+rdx] + vpclmulqdq zmm15,zmm7,zmm20,0x11 + vpclmulqdq zmm16,zmm7,zmm20,0x00 + vpclmulqdq zmm17,zmm7,zmm20,0x01 + vpclmulqdq zmm19,zmm7,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[288+rdx] + vpclmulqdq zmm6,zmm10,zmm20,0x11 + vpclmulqdq zmm7,zmm10,zmm20,0x00 + vpternlogq zmm15,zmm6,zmm0,0x96 + vpternlogq zmm16,zmm7,zmm3,0x96 + vpclmulqdq zmm6,zmm10,zmm20,0x01 + vpclmulqdq zmm7,zmm10,zmm20,0x10 + vpternlogq zmm17,zmm6,zmm4,0x96 + vpternlogq zmm19,zmm7,zmm5,0x96 + + vpxorq zmm17,zmm17,zmm19 + vpsrldq zmm4,zmm17,8 + vpslldq zmm5,zmm17,8 + vpxorq zmm0,zmm15,zmm4 + vpxorq zmm3,zmm16,zmm5 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_816 +$L$_small_initial_partial_block_816: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm12 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[176+rdx] + vpclmulqdq zmm0,zmm6,zmm20,0x11 + vpclmulqdq zmm3,zmm6,zmm20,0x00 + vpclmulqdq zmm4,zmm6,zmm20,0x01 + vpclmulqdq zmm5,zmm6,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[240+rdx] + vpclmulqdq zmm15,zmm7,zmm20,0x11 + vpclmulqdq zmm16,zmm7,zmm20,0x00 + vpclmulqdq zmm17,zmm7,zmm20,0x01 + vpclmulqdq zmm19,zmm7,zmm20,0x10 + vpxorq zmm15,zmm0,zmm15 + vpxorq zmm16,zmm3,zmm16 + vpxorq zmm17,zmm4,zmm17 + vpxorq zmm19,zmm5,zmm19 + vmovdqu64 ymm20,YMMWORD[304+rdx] + vinserti64x2 zmm20,zmm20,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm10,zmm20,0x01 + vpclmulqdq zmm5,zmm10,zmm20,0x10 + vpclmulqdq zmm0,zmm10,zmm20,0x11 + vpclmulqdq zmm3,zmm10,zmm20,0x00 + + vpxorq zmm4,zmm4,zmm17 + vpxorq zmm5,zmm5,zmm19 + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm16 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq 
ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_816: + + or r13,r13 + je NEAR $L$_after_reduction_816 + vpxorq xmm14,xmm14,xmm13 +$L$_after_reduction_816: + jmp NEAR $L$_small_initial_blocks_encrypted_804 +$L$_small_initial_num_blocks_is_13_804: + vmovdqa64 zmm29,ZMMWORD[SHUF_MASK] + vshufi64x2 zmm2,zmm2,zmm2,0 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpaddd zmm3,zmm2,ZMMWORD[ddq_add_5678] + vpaddd zmm4,zmm0,ZMMWORD[ddq_add_8888] + vpaddd zmm5,zmm3,ZMMWORD[ddq_add_8888] + lea r10,[byte64_len_to_mask_table] + mov r15,r13 + sub r15,192 + kmovq k1,[r15*8+r10] + vextracti32x4 xmm2,zmm5,0 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb xmm5,xmm5,xmm29 + vmovdqu8 zmm6,ZMMWORD[r11*1+r9] + vmovdqu8 zmm7,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm10,ZMMWORD[128+r11*1+r9] + vmovdqu8 xmm11{k1}{z},[192+r11*1+r9] + vbroadcastf64x2 zmm15,ZMMWORD[rcx] + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm15 + vpxorq zmm4,zmm4,zmm15 + vpxorq xmm5,xmm5,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[16+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc xmm5,xmm5,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[32+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc xmm5,xmm5,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[48+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc xmm5,xmm5,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[64+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc xmm5,xmm5,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[80+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc xmm5,xmm5,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[96+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc xmm5,xmm5,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[112+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc xmm5,xmm5,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[128+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc xmm5,xmm5,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc xmm5,xmm5,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc xmm5,xmm5,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc xmm5,xmm5,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[192+rcx] + vaesenclast zmm0,zmm0,zmm15 + vaesenclast zmm3,zmm3,zmm15 + vaesenclast zmm4,zmm4,zmm15 + vaesenclast xmm5,xmm5,xmm15 + vpxorq zmm0,zmm0,zmm6 + vpxorq zmm3,zmm3,zmm7 + vpxorq zmm4,zmm4,zmm10 + vpxorq xmm5,xmm5,xmm11 + vextracti32x4 xmm12,zmm5,0 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm4 + vmovdqu8 XMMWORD[192+r11*1+r10]{k1},xmm5 + vmovdqu8 zmm5{k1}{z},zmm5 + vpshufb zmm6,zmm6,zmm29 + vpshufb zmm7,zmm7,zmm29 + vpshufb zmm10,zmm10,zmm29 + vpshufb xmm11,xmm11,xmm29 + vextracti32x4 xmm13,zmm11,0 + sub r13,16 * (13 - 
1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_817 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[144+rdx] + vpclmulqdq zmm0,zmm6,zmm20,0x11 + vpclmulqdq zmm3,zmm6,zmm20,0x00 + vpclmulqdq zmm4,zmm6,zmm20,0x01 + vpclmulqdq zmm5,zmm6,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[208+rdx] + vpclmulqdq zmm15,zmm7,zmm20,0x11 + vpclmulqdq zmm16,zmm7,zmm20,0x00 + vpclmulqdq zmm17,zmm7,zmm20,0x01 + vpclmulqdq zmm19,zmm7,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[272+rdx] + vpclmulqdq zmm6,zmm10,zmm20,0x11 + vpclmulqdq zmm7,zmm10,zmm20,0x00 + vpternlogq zmm15,zmm6,zmm0,0x96 + vpternlogq zmm16,zmm7,zmm3,0x96 + vpclmulqdq zmm6,zmm10,zmm20,0x01 + vpclmulqdq zmm7,zmm10,zmm20,0x10 + vpternlogq zmm17,zmm6,zmm4,0x96 + vpternlogq zmm19,zmm7,zmm5,0x96 + vmovdqu64 xmm20,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm11,xmm20,0x01 + vpclmulqdq xmm5,xmm11,xmm20,0x10 + vpclmulqdq xmm0,xmm11,xmm20,0x11 + vpclmulqdq xmm3,xmm11,xmm20,0x00 + + vpxorq zmm4,zmm4,zmm17 + vpxorq zmm5,zmm5,zmm19 + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm16 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_817 +$L$_small_initial_partial_block_817: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm12 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[160+rdx] + vpclmulqdq zmm0,zmm6,zmm20,0x11 + vpclmulqdq zmm3,zmm6,zmm20,0x00 + vpclmulqdq zmm4,zmm6,zmm20,0x01 + vpclmulqdq zmm5,zmm6,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[224+rdx] + vpclmulqdq zmm15,zmm7,zmm20,0x11 + vpclmulqdq zmm16,zmm7,zmm20,0x00 + vpclmulqdq zmm17,zmm7,zmm20,0x01 + vpclmulqdq zmm19,zmm7,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[288+rdx] + vpclmulqdq zmm6,zmm10,zmm20,0x11 + vpclmulqdq zmm7,zmm10,zmm20,0x00 + vpternlogq zmm15,zmm6,zmm0,0x96 + vpternlogq zmm16,zmm7,zmm3,0x96 + vpclmulqdq zmm6,zmm10,zmm20,0x01 + vpclmulqdq zmm7,zmm10,zmm20,0x10 + vpternlogq zmm17,zmm6,zmm4,0x96 + vpternlogq zmm19,zmm7,zmm5,0x96 + + vpxorq zmm17,zmm17,zmm19 + vpsrldq zmm4,zmm17,8 + vpslldq zmm5,zmm17,8 + vpxorq zmm0,zmm15,zmm4 + vpxorq zmm3,zmm16,zmm5 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_817: + + or r13,r13 + je NEAR $L$_after_reduction_817 + vpxorq xmm14,xmm14,xmm13 +$L$_after_reduction_817: + jmp NEAR $L$_small_initial_blocks_encrypted_804 +$L$_small_initial_num_blocks_is_14_804: + vmovdqa64 zmm29,ZMMWORD[SHUF_MASK] + vshufi64x2 zmm2,zmm2,zmm2,0 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpaddd zmm3,zmm2,ZMMWORD[ddq_add_5678] + vpaddd zmm4,zmm0,ZMMWORD[ddq_add_8888] + vpaddd 
zmm5,zmm3,ZMMWORD[ddq_add_8888] + lea r10,[byte64_len_to_mask_table] + mov r15,r13 + sub r15,192 + kmovq k1,[r15*8+r10] + vextracti32x4 xmm2,zmm5,1 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb ymm5,ymm5,ymm29 + vmovdqu8 zmm6,ZMMWORD[r11*1+r9] + vmovdqu8 zmm7,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm10,ZMMWORD[128+r11*1+r9] + vmovdqu8 ymm11{k1}{z},[192+r11*1+r9] + vbroadcastf64x2 zmm15,ZMMWORD[rcx] + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm15 + vpxorq zmm4,zmm4,zmm15 + vpxorq ymm5,ymm5,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[16+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc ymm5,ymm5,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[32+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc ymm5,ymm5,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[48+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc ymm5,ymm5,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[64+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc ymm5,ymm5,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[80+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc ymm5,ymm5,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[96+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc ymm5,ymm5,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[112+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc ymm5,ymm5,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[128+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc ymm5,ymm5,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc ymm5,ymm5,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc ymm5,ymm5,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc ymm5,ymm5,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[192+rcx] + vaesenclast zmm0,zmm0,zmm15 + vaesenclast zmm3,zmm3,zmm15 + vaesenclast zmm4,zmm4,zmm15 + vaesenclast ymm5,ymm5,ymm15 + vpxorq zmm0,zmm0,zmm6 + vpxorq zmm3,zmm3,zmm7 + vpxorq zmm4,zmm4,zmm10 + vpxorq ymm5,ymm5,ymm11 + vextracti32x4 xmm12,zmm5,1 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm4 + vmovdqu8 YMMWORD[192+r11*1+r10]{k1},ymm5 + vmovdqu8 zmm5{k1}{z},zmm5 + vpshufb zmm6,zmm6,zmm29 + vpshufb zmm7,zmm7,zmm29 + vpshufb zmm10,zmm10,zmm29 + vpshufb ymm11,ymm11,ymm29 + vextracti32x4 xmm13,zmm11,1 + sub r13,16 * (14 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_818 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[128+rdx] + vpclmulqdq zmm0,zmm6,zmm20,0x11 + vpclmulqdq zmm3,zmm6,zmm20,0x00 + vpclmulqdq zmm4,zmm6,zmm20,0x01 + vpclmulqdq zmm5,zmm6,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[192+rdx] + vpclmulqdq zmm15,zmm7,zmm20,0x11 + vpclmulqdq zmm16,zmm7,zmm20,0x00 + vpclmulqdq zmm17,zmm7,zmm20,0x01 + vpclmulqdq zmm19,zmm7,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[256+rdx] + vpclmulqdq zmm6,zmm10,zmm20,0x11 + vpclmulqdq zmm7,zmm10,zmm20,0x00 + vpternlogq zmm15,zmm6,zmm0,0x96 + vpternlogq zmm16,zmm7,zmm3,0x96 + vpclmulqdq zmm6,zmm10,zmm20,0x01 + vpclmulqdq zmm7,zmm10,zmm20,0x10 + vpternlogq zmm17,zmm6,zmm4,0x96 + vpternlogq 
zmm19,zmm7,zmm5,0x96 + vmovdqu64 ymm20,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm11,ymm20,0x01 + vpclmulqdq ymm5,ymm11,ymm20,0x10 + vpclmulqdq ymm0,ymm11,ymm20,0x11 + vpclmulqdq ymm3,ymm11,ymm20,0x00 + + vpxorq zmm4,zmm4,zmm17 + vpxorq zmm5,zmm5,zmm19 + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm16 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_818 +$L$_small_initial_partial_block_818: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm12 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[144+rdx] + vpclmulqdq zmm0,zmm6,zmm20,0x11 + vpclmulqdq zmm3,zmm6,zmm20,0x00 + vpclmulqdq zmm4,zmm6,zmm20,0x01 + vpclmulqdq zmm5,zmm6,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[208+rdx] + vpclmulqdq zmm15,zmm7,zmm20,0x11 + vpclmulqdq zmm16,zmm7,zmm20,0x00 + vpclmulqdq zmm17,zmm7,zmm20,0x01 + vpclmulqdq zmm19,zmm7,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[272+rdx] + vpclmulqdq zmm6,zmm10,zmm20,0x11 + vpclmulqdq zmm7,zmm10,zmm20,0x00 + vpternlogq zmm15,zmm6,zmm0,0x96 + vpternlogq zmm16,zmm7,zmm3,0x96 + vpclmulqdq zmm6,zmm10,zmm20,0x01 + vpclmulqdq zmm7,zmm10,zmm20,0x10 + vpternlogq zmm17,zmm6,zmm4,0x96 + vpternlogq zmm19,zmm7,zmm5,0x96 + vmovdqu64 xmm20,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm11,xmm20,0x01 + vpclmulqdq xmm5,xmm11,xmm20,0x10 + vpclmulqdq xmm0,xmm11,xmm20,0x11 + vpclmulqdq xmm3,xmm11,xmm20,0x00 + + vpxorq zmm4,zmm4,zmm17 + vpxorq zmm5,zmm5,zmm19 + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm16 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_818: + + or r13,r13 + je NEAR $L$_after_reduction_818 + vpxorq xmm14,xmm14,xmm13 +$L$_after_reduction_818: + jmp NEAR $L$_small_initial_blocks_encrypted_804 +$L$_small_initial_num_blocks_is_15_804: + vmovdqa64 zmm29,ZMMWORD[SHUF_MASK] + vshufi64x2 zmm2,zmm2,zmm2,0 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpaddd zmm3,zmm2,ZMMWORD[ddq_add_5678] + vpaddd zmm4,zmm0,ZMMWORD[ddq_add_8888] + vpaddd zmm5,zmm3,ZMMWORD[ddq_add_8888] + lea r10,[byte64_len_to_mask_table] + mov r15,r13 + sub r15,192 + kmovq k1,[r15*8+r10] + vextracti32x4 xmm2,zmm5,2 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb zmm5,zmm5,zmm29 + vmovdqu8 zmm6,ZMMWORD[r11*1+r9] + vmovdqu8 zmm7,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm10,ZMMWORD[128+r11*1+r9] + vmovdqu8 zmm11{k1}{z},[192+r11*1+r9] + vbroadcastf64x2 zmm15,ZMMWORD[rcx] + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm15 + 
vpxorq zmm4,zmm4,zmm15 + vpxorq zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[16+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[32+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[48+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[64+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[80+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[96+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[112+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[128+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[192+rcx] + vaesenclast zmm0,zmm0,zmm15 + vaesenclast zmm3,zmm3,zmm15 + vaesenclast zmm4,zmm4,zmm15 + vaesenclast zmm5,zmm5,zmm15 + vpxorq zmm0,zmm0,zmm6 + vpxorq zmm3,zmm3,zmm7 + vpxorq zmm4,zmm4,zmm10 + vpxorq zmm5,zmm5,zmm11 + vextracti32x4 xmm12,zmm5,2 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm4 + vmovdqu8 ZMMWORD[192+r11*1+r10]{k1},zmm5 + vmovdqu8 zmm5{k1}{z},zmm5 + vpshufb zmm6,zmm6,zmm29 + vpshufb zmm7,zmm7,zmm29 + vpshufb zmm10,zmm10,zmm29 + vpshufb zmm11,zmm11,zmm29 + vextracti32x4 xmm13,zmm11,2 + sub r13,16 * (15 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_819 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[112+rdx] + vpclmulqdq zmm0,zmm6,zmm20,0x11 + vpclmulqdq zmm3,zmm6,zmm20,0x00 + vpclmulqdq zmm4,zmm6,zmm20,0x01 + vpclmulqdq zmm5,zmm6,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[176+rdx] + vpclmulqdq zmm15,zmm7,zmm20,0x11 + vpclmulqdq zmm16,zmm7,zmm20,0x00 + vpclmulqdq zmm17,zmm7,zmm20,0x01 + vpclmulqdq zmm19,zmm7,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[240+rdx] + vpclmulqdq zmm6,zmm10,zmm20,0x11 + vpclmulqdq zmm7,zmm10,zmm20,0x00 + vpternlogq zmm15,zmm6,zmm0,0x96 + vpternlogq zmm16,zmm7,zmm3,0x96 + vpclmulqdq zmm6,zmm10,zmm20,0x01 + vpclmulqdq zmm7,zmm10,zmm20,0x10 + vpternlogq zmm17,zmm6,zmm4,0x96 + vpternlogq zmm19,zmm7,zmm5,0x96 + vmovdqu64 ymm20,YMMWORD[304+rdx] + vinserti64x2 zmm20,zmm20,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm11,zmm20,0x01 + vpclmulqdq zmm5,zmm11,zmm20,0x10 + vpclmulqdq zmm0,zmm11,zmm20,0x11 + vpclmulqdq zmm3,zmm11,zmm20,0x00 + + vpxorq zmm4,zmm4,zmm17 + vpxorq zmm5,zmm5,zmm19 + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm16 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 
+ vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_819 +$L$_small_initial_partial_block_819: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm12 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[128+rdx] + vpclmulqdq zmm0,zmm6,zmm20,0x11 + vpclmulqdq zmm3,zmm6,zmm20,0x00 + vpclmulqdq zmm4,zmm6,zmm20,0x01 + vpclmulqdq zmm5,zmm6,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[192+rdx] + vpclmulqdq zmm15,zmm7,zmm20,0x11 + vpclmulqdq zmm16,zmm7,zmm20,0x00 + vpclmulqdq zmm17,zmm7,zmm20,0x01 + vpclmulqdq zmm19,zmm7,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[256+rdx] + vpclmulqdq zmm6,zmm10,zmm20,0x11 + vpclmulqdq zmm7,zmm10,zmm20,0x00 + vpternlogq zmm15,zmm6,zmm0,0x96 + vpternlogq zmm16,zmm7,zmm3,0x96 + vpclmulqdq zmm6,zmm10,zmm20,0x01 + vpclmulqdq zmm7,zmm10,zmm20,0x10 + vpternlogq zmm17,zmm6,zmm4,0x96 + vpternlogq zmm19,zmm7,zmm5,0x96 + vmovdqu64 ymm20,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm11,ymm20,0x01 + vpclmulqdq ymm5,ymm11,ymm20,0x10 + vpclmulqdq ymm0,ymm11,ymm20,0x11 + vpclmulqdq ymm3,ymm11,ymm20,0x00 + + vpxorq zmm4,zmm4,zmm17 + vpxorq zmm5,zmm5,zmm19 + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm16 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_819: + + or r13,r13 + je NEAR $L$_after_reduction_819 + vpxorq xmm14,xmm14,xmm13 +$L$_after_reduction_819: + jmp NEAR $L$_small_initial_blocks_encrypted_804 +$L$_small_initial_num_blocks_is_16_804: + vmovdqa64 zmm29,ZMMWORD[SHUF_MASK] + vshufi64x2 zmm2,zmm2,zmm2,0 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpaddd zmm3,zmm2,ZMMWORD[ddq_add_5678] + vpaddd zmm4,zmm0,ZMMWORD[ddq_add_8888] + vpaddd zmm5,zmm3,ZMMWORD[ddq_add_8888] + lea r10,[byte64_len_to_mask_table] + mov r15,r13 + sub r15,192 + kmovq k1,[r15*8+r10] + vextracti32x4 xmm2,zmm5,3 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb zmm5,zmm5,zmm29 + vmovdqu8 zmm6,ZMMWORD[r11*1+r9] + vmovdqu8 zmm7,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm10,ZMMWORD[128+r11*1+r9] + vmovdqu8 zmm11{k1}{z},[192+r11*1+r9] + vbroadcastf64x2 zmm15,ZMMWORD[rcx] + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm15 + vpxorq zmm4,zmm4,zmm15 + vpxorq zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[16+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[32+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[48+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 
+ vaesenc zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[64+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[80+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[96+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[112+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[128+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[192+rcx] + vaesenclast zmm0,zmm0,zmm15 + vaesenclast zmm3,zmm3,zmm15 + vaesenclast zmm4,zmm4,zmm15 + vaesenclast zmm5,zmm5,zmm15 + vpxorq zmm0,zmm0,zmm6 + vpxorq zmm3,zmm3,zmm7 + vpxorq zmm4,zmm4,zmm10 + vpxorq zmm5,zmm5,zmm11 + vextracti32x4 xmm12,zmm5,3 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm4 + vmovdqu8 ZMMWORD[192+r11*1+r10]{k1},zmm5 + vmovdqu8 zmm5{k1}{z},zmm5 + vpshufb zmm6,zmm6,zmm29 + vpshufb zmm7,zmm7,zmm29 + vpshufb zmm10,zmm10,zmm29 + vpshufb zmm11,zmm11,zmm29 + vextracti32x4 xmm13,zmm11,3 + sub r13,16 * (16 - 1) +$L$_small_initial_partial_block_820: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm12 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[112+rdx] + vpclmulqdq zmm0,zmm6,zmm20,0x11 + vpclmulqdq zmm3,zmm6,zmm20,0x00 + vpclmulqdq zmm4,zmm6,zmm20,0x01 + vpclmulqdq zmm5,zmm6,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[176+rdx] + vpclmulqdq zmm15,zmm7,zmm20,0x11 + vpclmulqdq zmm16,zmm7,zmm20,0x00 + vpclmulqdq zmm17,zmm7,zmm20,0x01 + vpclmulqdq zmm19,zmm7,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[240+rdx] + vpclmulqdq zmm6,zmm10,zmm20,0x11 + vpclmulqdq zmm7,zmm10,zmm20,0x00 + vpternlogq zmm15,zmm6,zmm0,0x96 + vpternlogq zmm16,zmm7,zmm3,0x96 + vpclmulqdq zmm6,zmm10,zmm20,0x01 + vpclmulqdq zmm7,zmm10,zmm20,0x10 + vpternlogq zmm17,zmm6,zmm4,0x96 + vpternlogq zmm19,zmm7,zmm5,0x96 + vmovdqu64 ymm20,YMMWORD[304+rdx] + vinserti64x2 zmm20,zmm20,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm11,zmm20,0x01 + vpclmulqdq zmm5,zmm11,zmm20,0x10 + vpclmulqdq zmm0,zmm11,zmm20,0x11 + vpclmulqdq zmm3,zmm11,zmm20,0x00 + + vpxorq zmm4,zmm4,zmm17 + vpxorq zmm5,zmm5,zmm19 + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm16 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq 
xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_820: + vpxorq xmm14,xmm14,xmm13 +$L$_after_reduction_820: +$L$_small_initial_blocks_encrypted_804: +$L$_ghash_done_659: + vmovdqu64 XMMWORD[rdx],xmm2 + vmovdqu64 XMMWORD[64+rdx],xmm14 +$L$_enc_dec_done_659: + jmp NEAR $L$exit_gcm_decrypt +ALIGN 32 +$L$aes_gcm_decrypt_256_avx512: + cmp QWORD[112+rbp],0 + je NEAR $L$_enc_dec_done_821 + xor r14,r14 + vmovdqu64 xmm14,XMMWORD[64+rdx] + + mov r11,QWORD[r8] + or r11,r11 + je NEAR $L$_partial_block_done_822 + mov r10d,16 + lea r12,[byte_len_to_mask_table] + cmp QWORD[112+rbp],r10 + cmovc r10,QWORD[112+rbp] + add r12,r10 + add r12,r10 + kmovw k1,[r12] + vmovdqu8 xmm0{k1}{z},[r9] + + vmovdqu64 xmm3,XMMWORD[16+rdx] + vmovdqu64 xmm4,XMMWORD[336+rdx] + + + + lea r12,[SHIFT_MASK] + add r12,r11 + vmovdqu64 xmm5,XMMWORD[r12] + vpshufb xmm3,xmm3,xmm5 + + vmovdqa64 xmm6,xmm0 + vpxorq xmm3,xmm3,xmm0 + + + mov r13,QWORD[112+rbp] + add r13,r11 + sub r13,16 + jge NEAR $L$_no_extra_mask_822 + sub r12,r13 +$L$_no_extra_mask_822: + + + + vmovdqu64 xmm0,XMMWORD[16+r12] + vpand xmm3,xmm3,xmm0 + vpand xmm6,xmm6,xmm0 + vpshufb xmm6,xmm6,XMMWORD[SHUF_MASK] + vpshufb xmm6,xmm6,xmm5 + vpxorq xmm14,xmm14,xmm6 + cmp r13,0 + jl NEAR $L$_partial_incomplete_822 + + vpclmulqdq xmm7,xmm14,xmm4,0x11 + vpclmulqdq xmm10,xmm14,xmm4,0x00 + vpclmulqdq xmm11,xmm14,xmm4,0x01 + vpclmulqdq xmm14,xmm14,xmm4,0x10 + vpxorq xmm14,xmm14,xmm11 + + vpsrldq xmm11,xmm14,8 + vpslldq xmm14,xmm14,8 + vpxorq xmm7,xmm7,xmm11 + vpxorq xmm14,xmm14,xmm10 + + + + vmovdqu64 xmm11,XMMWORD[POLY2] + + vpclmulqdq xmm10,xmm11,xmm14,0x01 + vpslldq xmm10,xmm10,8 + vpxorq xmm14,xmm14,xmm10 + + + + vpclmulqdq xmm10,xmm11,xmm14,0x00 + vpsrldq xmm10,xmm10,4 + vpclmulqdq xmm14,xmm11,xmm14,0x10 + vpslldq xmm14,xmm14,4 + + vpternlogq xmm14,xmm7,xmm10,0x96 + + mov QWORD[r8],0 + + mov r12,r11 + mov r11,16 + sub r11,r12 + jmp NEAR $L$_enc_dec_done_822 + +$L$_partial_incomplete_822: + mov r12,QWORD[112+rbp] + add QWORD[r8],r12 + mov r11,QWORD[112+rbp] + +$L$_enc_dec_done_822: + + + lea r12,[byte_len_to_mask_table] + kmovw k1,[r11*2+r12] + vmovdqu64 XMMWORD[64+rdx],xmm14 + mov r12,QWORD[120+rbp] + vmovdqu8 XMMWORD[r12]{k1},xmm3 +$L$_partial_block_done_822: + vmovdqu64 xmm2,XMMWORD[rdx] + mov r13,QWORD[112+rbp] + sub r13,r11 + je NEAR $L$_enc_dec_done_821 + cmp r13,256 + jbe NEAR $L$_message_below_equal_16_blocks_821 + + vmovdqa64 zmm29,ZMMWORD[SHUF_MASK] + vmovdqa64 zmm27,ZMMWORD[ddq_addbe_4444] + vmovdqa64 zmm28,ZMMWORD[ddq_addbe_1234] + + + + + + + vmovd r15d,xmm2 + and r15d,255 + + vshufi64x2 zmm2,zmm2,zmm2,0 + vpshufb zmm2,zmm2,zmm29 + + + + cmp r15b,240 + jae NEAR $L$_next_16_overflow_823 + vpaddd zmm7,zmm2,zmm28 + vpaddd zmm10,zmm7,zmm27 + vpaddd zmm11,zmm10,zmm27 + vpaddd zmm12,zmm11,zmm27 + jmp NEAR $L$_next_16_ok_823 +$L$_next_16_overflow_823: + vpshufb zmm2,zmm2,zmm29 + vmovdqa64 zmm12,ZMMWORD[ddq_add_4444] + vpaddd zmm7,zmm2,ZMMWORD[ddq_add_1234] + vpaddd zmm10,zmm7,zmm12 + vpaddd zmm11,zmm10,zmm12 + vpaddd zmm12,zmm11,zmm12 + vpshufb zmm7,zmm7,zmm29 + vpshufb zmm10,zmm10,zmm29 + vpshufb zmm11,zmm11,zmm29 + vpshufb zmm12,zmm12,zmm29 +$L$_next_16_ok_823: + vshufi64x2 zmm2,zmm12,zmm12,255 + add r15b,16 + + vmovdqu8 zmm0,ZMMWORD[r11*1+r9] + vmovdqu8 zmm3,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm4,ZMMWORD[128+r11*1+r9] + vmovdqu8 zmm5,ZMMWORD[192+r11*1+r9] + + + vbroadcastf64x2 zmm6,ZMMWORD[rcx] + vpxorq zmm7,zmm7,zmm6 + vpxorq zmm10,zmm10,zmm6 + vpxorq zmm11,zmm11,zmm6 + vpxorq zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[16+rcx] + vaesenc zmm7,zmm7,zmm6 + vaesenc 
zmm10,zmm10,zmm6 + vaesenc zmm11,zmm11,zmm6 + vaesenc zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[32+rcx] + vaesenc zmm7,zmm7,zmm6 + vaesenc zmm10,zmm10,zmm6 + vaesenc zmm11,zmm11,zmm6 + vaesenc zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[48+rcx] + vaesenc zmm7,zmm7,zmm6 + vaesenc zmm10,zmm10,zmm6 + vaesenc zmm11,zmm11,zmm6 + vaesenc zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[64+rcx] + vaesenc zmm7,zmm7,zmm6 + vaesenc zmm10,zmm10,zmm6 + vaesenc zmm11,zmm11,zmm6 + vaesenc zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[80+rcx] + vaesenc zmm7,zmm7,zmm6 + vaesenc zmm10,zmm10,zmm6 + vaesenc zmm11,zmm11,zmm6 + vaesenc zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[96+rcx] + vaesenc zmm7,zmm7,zmm6 + vaesenc zmm10,zmm10,zmm6 + vaesenc zmm11,zmm11,zmm6 + vaesenc zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[112+rcx] + vaesenc zmm7,zmm7,zmm6 + vaesenc zmm10,zmm10,zmm6 + vaesenc zmm11,zmm11,zmm6 + vaesenc zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[128+rcx] + vaesenc zmm7,zmm7,zmm6 + vaesenc zmm10,zmm10,zmm6 + vaesenc zmm11,zmm11,zmm6 + vaesenc zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[144+rcx] + vaesenc zmm7,zmm7,zmm6 + vaesenc zmm10,zmm10,zmm6 + vaesenc zmm11,zmm11,zmm6 + vaesenc zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[160+rcx] + vaesenc zmm7,zmm7,zmm6 + vaesenc zmm10,zmm10,zmm6 + vaesenc zmm11,zmm11,zmm6 + vaesenc zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[176+rcx] + vaesenc zmm7,zmm7,zmm6 + vaesenc zmm10,zmm10,zmm6 + vaesenc zmm11,zmm11,zmm6 + vaesenc zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[192+rcx] + vaesenc zmm7,zmm7,zmm6 + vaesenc zmm10,zmm10,zmm6 + vaesenc zmm11,zmm11,zmm6 + vaesenc zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[208+rcx] + vaesenc zmm7,zmm7,zmm6 + vaesenc zmm10,zmm10,zmm6 + vaesenc zmm11,zmm11,zmm6 + vaesenc zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[224+rcx] + vaesenclast zmm7,zmm7,zmm6 + vaesenclast zmm10,zmm10,zmm6 + vaesenclast zmm11,zmm11,zmm6 + vaesenclast zmm12,zmm12,zmm6 + + + vpxorq zmm7,zmm7,zmm0 + vpxorq zmm10,zmm10,zmm3 + vpxorq zmm11,zmm11,zmm4 + vpxorq zmm12,zmm12,zmm5 + + + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm7 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm10 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm11 + vmovdqu8 ZMMWORD[192+r11*1+r10],zmm12 + + vpshufb zmm7,zmm0,zmm29 + vpshufb zmm10,zmm3,zmm29 + vpshufb zmm11,zmm4,zmm29 + vpshufb zmm12,zmm5,zmm29 + vmovdqa64 ZMMWORD[768+rsp],zmm7 + vmovdqa64 ZMMWORD[832+rsp],zmm10 + vmovdqa64 ZMMWORD[896+rsp],zmm11 + vmovdqa64 ZMMWORD[960+rsp],zmm12 + test r14,r14 + jnz NEAR $L$_skip_hkeys_precomputation_824 + + vmovdqu64 zmm0,ZMMWORD[288+rdx] + vmovdqu64 ZMMWORD[704+rsp],zmm0 + + vmovdqu64 zmm3,ZMMWORD[224+rdx] + vmovdqu64 ZMMWORD[640+rsp],zmm3 + + + vshufi64x2 zmm3,zmm3,zmm3,0x00 + + vmovdqu64 zmm4,ZMMWORD[160+rdx] + vmovdqu64 ZMMWORD[576+rsp],zmm4 + + vmovdqu64 zmm5,ZMMWORD[96+rdx] + vmovdqu64 ZMMWORD[512+rsp],zmm5 +$L$_skip_hkeys_precomputation_824: + cmp r13,512 + jb NEAR $L$_message_below_32_blocks_821 + + + + cmp r15b,240 + jae NEAR $L$_next_16_overflow_825 + vpaddd zmm7,zmm2,zmm28 + vpaddd zmm10,zmm7,zmm27 + vpaddd zmm11,zmm10,zmm27 + vpaddd zmm12,zmm11,zmm27 + jmp NEAR $L$_next_16_ok_825 +$L$_next_16_overflow_825: + vpshufb zmm2,zmm2,zmm29 + vmovdqa64 zmm12,ZMMWORD[ddq_add_4444] + vpaddd zmm7,zmm2,ZMMWORD[ddq_add_1234] + vpaddd zmm10,zmm7,zmm12 + vpaddd zmm11,zmm10,zmm12 + vpaddd zmm12,zmm11,zmm12 + vpshufb zmm7,zmm7,zmm29 + vpshufb zmm10,zmm10,zmm29 + vpshufb zmm11,zmm11,zmm29 + vpshufb zmm12,zmm12,zmm29 +$L$_next_16_ok_825: + vshufi64x2 
zmm2,zmm12,zmm12,255 + add r15b,16 + + vmovdqu8 zmm0,ZMMWORD[256+r11*1+r9] + vmovdqu8 zmm3,ZMMWORD[320+r11*1+r9] + vmovdqu8 zmm4,ZMMWORD[384+r11*1+r9] + vmovdqu8 zmm5,ZMMWORD[448+r11*1+r9] + + + vbroadcastf64x2 zmm6,ZMMWORD[rcx] + vpxorq zmm7,zmm7,zmm6 + vpxorq zmm10,zmm10,zmm6 + vpxorq zmm11,zmm11,zmm6 + vpxorq zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[16+rcx] + vaesenc zmm7,zmm7,zmm6 + vaesenc zmm10,zmm10,zmm6 + vaesenc zmm11,zmm11,zmm6 + vaesenc zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[32+rcx] + vaesenc zmm7,zmm7,zmm6 + vaesenc zmm10,zmm10,zmm6 + vaesenc zmm11,zmm11,zmm6 + vaesenc zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[48+rcx] + vaesenc zmm7,zmm7,zmm6 + vaesenc zmm10,zmm10,zmm6 + vaesenc zmm11,zmm11,zmm6 + vaesenc zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[64+rcx] + vaesenc zmm7,zmm7,zmm6 + vaesenc zmm10,zmm10,zmm6 + vaesenc zmm11,zmm11,zmm6 + vaesenc zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[80+rcx] + vaesenc zmm7,zmm7,zmm6 + vaesenc zmm10,zmm10,zmm6 + vaesenc zmm11,zmm11,zmm6 + vaesenc zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[96+rcx] + vaesenc zmm7,zmm7,zmm6 + vaesenc zmm10,zmm10,zmm6 + vaesenc zmm11,zmm11,zmm6 + vaesenc zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[112+rcx] + vaesenc zmm7,zmm7,zmm6 + vaesenc zmm10,zmm10,zmm6 + vaesenc zmm11,zmm11,zmm6 + vaesenc zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[128+rcx] + vaesenc zmm7,zmm7,zmm6 + vaesenc zmm10,zmm10,zmm6 + vaesenc zmm11,zmm11,zmm6 + vaesenc zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[144+rcx] + vaesenc zmm7,zmm7,zmm6 + vaesenc zmm10,zmm10,zmm6 + vaesenc zmm11,zmm11,zmm6 + vaesenc zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[160+rcx] + vaesenc zmm7,zmm7,zmm6 + vaesenc zmm10,zmm10,zmm6 + vaesenc zmm11,zmm11,zmm6 + vaesenc zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[176+rcx] + vaesenc zmm7,zmm7,zmm6 + vaesenc zmm10,zmm10,zmm6 + vaesenc zmm11,zmm11,zmm6 + vaesenc zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[192+rcx] + vaesenc zmm7,zmm7,zmm6 + vaesenc zmm10,zmm10,zmm6 + vaesenc zmm11,zmm11,zmm6 + vaesenc zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[208+rcx] + vaesenc zmm7,zmm7,zmm6 + vaesenc zmm10,zmm10,zmm6 + vaesenc zmm11,zmm11,zmm6 + vaesenc zmm12,zmm12,zmm6 + vbroadcastf64x2 zmm6,ZMMWORD[224+rcx] + vaesenclast zmm7,zmm7,zmm6 + vaesenclast zmm10,zmm10,zmm6 + vaesenclast zmm11,zmm11,zmm6 + vaesenclast zmm12,zmm12,zmm6 + + + vpxorq zmm7,zmm7,zmm0 + vpxorq zmm10,zmm10,zmm3 + vpxorq zmm11,zmm11,zmm4 + vpxorq zmm12,zmm12,zmm5 + + + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[256+r11*1+r10],zmm7 + vmovdqu8 ZMMWORD[320+r11*1+r10],zmm10 + vmovdqu8 ZMMWORD[384+r11*1+r10],zmm11 + vmovdqu8 ZMMWORD[448+r11*1+r10],zmm12 + + vpshufb zmm7,zmm0,zmm29 + vpshufb zmm10,zmm3,zmm29 + vpshufb zmm11,zmm4,zmm29 + vpshufb zmm12,zmm5,zmm29 + vmovdqa64 ZMMWORD[1024+rsp],zmm7 + vmovdqa64 ZMMWORD[1088+rsp],zmm10 + vmovdqa64 ZMMWORD[1152+rsp],zmm11 + vmovdqa64 ZMMWORD[1216+rsp],zmm12 + test r14,r14 + jnz NEAR $L$_skip_hkeys_precomputation_826 + vmovdqu64 zmm3,ZMMWORD[640+rsp] + + + vshufi64x2 zmm3,zmm3,zmm3,0x00 + + vmovdqu64 zmm4,ZMMWORD[576+rsp] + vmovdqu64 zmm5,ZMMWORD[512+rsp] + + vpclmulqdq zmm6,zmm4,zmm3,0x11 + vpclmulqdq zmm7,zmm4,zmm3,0x00 + vpclmulqdq zmm10,zmm4,zmm3,0x01 + vpclmulqdq zmm4,zmm4,zmm3,0x10 + vpxorq zmm4,zmm4,zmm10 + + vpsrldq zmm10,zmm4,8 + vpslldq zmm4,zmm4,8 + vpxorq zmm6,zmm6,zmm10 + vpxorq zmm4,zmm4,zmm7 + + + + vmovdqu64 zmm10,ZMMWORD[POLY2] + + vpclmulqdq zmm7,zmm10,zmm4,0x01 + vpslldq zmm7,zmm7,8 + vpxorq zmm4,zmm4,zmm7 + + + + vpclmulqdq zmm7,zmm10,zmm4,0x00 + 
vpsrldq zmm7,zmm7,4 + vpclmulqdq zmm4,zmm10,zmm4,0x10 + vpslldq zmm4,zmm4,4 + + vpternlogq zmm4,zmm6,zmm7,0x96 + + vmovdqu64 ZMMWORD[448+rsp],zmm4 + + vpclmulqdq zmm6,zmm5,zmm3,0x11 + vpclmulqdq zmm7,zmm5,zmm3,0x00 + vpclmulqdq zmm10,zmm5,zmm3,0x01 + vpclmulqdq zmm5,zmm5,zmm3,0x10 + vpxorq zmm5,zmm5,zmm10 + + vpsrldq zmm10,zmm5,8 + vpslldq zmm5,zmm5,8 + vpxorq zmm6,zmm6,zmm10 + vpxorq zmm5,zmm5,zmm7 + + + + vmovdqu64 zmm10,ZMMWORD[POLY2] + + vpclmulqdq zmm7,zmm10,zmm5,0x01 + vpslldq zmm7,zmm7,8 + vpxorq zmm5,zmm5,zmm7 + + + + vpclmulqdq zmm7,zmm10,zmm5,0x00 + vpsrldq zmm7,zmm7,4 + vpclmulqdq zmm5,zmm10,zmm5,0x10 + vpslldq zmm5,zmm5,4 + + vpternlogq zmm5,zmm6,zmm7,0x96 + + vmovdqu64 ZMMWORD[384+rsp],zmm5 + + vpclmulqdq zmm6,zmm4,zmm3,0x11 + vpclmulqdq zmm7,zmm4,zmm3,0x00 + vpclmulqdq zmm10,zmm4,zmm3,0x01 + vpclmulqdq zmm4,zmm4,zmm3,0x10 + vpxorq zmm4,zmm4,zmm10 + + vpsrldq zmm10,zmm4,8 + vpslldq zmm4,zmm4,8 + vpxorq zmm6,zmm6,zmm10 + vpxorq zmm4,zmm4,zmm7 + + + + vmovdqu64 zmm10,ZMMWORD[POLY2] + + vpclmulqdq zmm7,zmm10,zmm4,0x01 + vpslldq zmm7,zmm7,8 + vpxorq zmm4,zmm4,zmm7 + + + + vpclmulqdq zmm7,zmm10,zmm4,0x00 + vpsrldq zmm7,zmm7,4 + vpclmulqdq zmm4,zmm10,zmm4,0x10 + vpslldq zmm4,zmm4,4 + + vpternlogq zmm4,zmm6,zmm7,0x96 + + vmovdqu64 ZMMWORD[320+rsp],zmm4 + + vpclmulqdq zmm6,zmm5,zmm3,0x11 + vpclmulqdq zmm7,zmm5,zmm3,0x00 + vpclmulqdq zmm10,zmm5,zmm3,0x01 + vpclmulqdq zmm5,zmm5,zmm3,0x10 + vpxorq zmm5,zmm5,zmm10 + + vpsrldq zmm10,zmm5,8 + vpslldq zmm5,zmm5,8 + vpxorq zmm6,zmm6,zmm10 + vpxorq zmm5,zmm5,zmm7 + + + + vmovdqu64 zmm10,ZMMWORD[POLY2] + + vpclmulqdq zmm7,zmm10,zmm5,0x01 + vpslldq zmm7,zmm7,8 + vpxorq zmm5,zmm5,zmm7 + + + + vpclmulqdq zmm7,zmm10,zmm5,0x00 + vpsrldq zmm7,zmm7,4 + vpclmulqdq zmm5,zmm10,zmm5,0x10 + vpslldq zmm5,zmm5,4 + + vpternlogq zmm5,zmm6,zmm7,0x96 + + vmovdqu64 ZMMWORD[256+rsp],zmm5 + + vpclmulqdq zmm6,zmm4,zmm3,0x11 + vpclmulqdq zmm7,zmm4,zmm3,0x00 + vpclmulqdq zmm10,zmm4,zmm3,0x01 + vpclmulqdq zmm4,zmm4,zmm3,0x10 + vpxorq zmm4,zmm4,zmm10 + + vpsrldq zmm10,zmm4,8 + vpslldq zmm4,zmm4,8 + vpxorq zmm6,zmm6,zmm10 + vpxorq zmm4,zmm4,zmm7 + + + + vmovdqu64 zmm10,ZMMWORD[POLY2] + + vpclmulqdq zmm7,zmm10,zmm4,0x01 + vpslldq zmm7,zmm7,8 + vpxorq zmm4,zmm4,zmm7 + + + + vpclmulqdq zmm7,zmm10,zmm4,0x00 + vpsrldq zmm7,zmm7,4 + vpclmulqdq zmm4,zmm10,zmm4,0x10 + vpslldq zmm4,zmm4,4 + + vpternlogq zmm4,zmm6,zmm7,0x96 + + vmovdqu64 ZMMWORD[192+rsp],zmm4 + + vpclmulqdq zmm6,zmm5,zmm3,0x11 + vpclmulqdq zmm7,zmm5,zmm3,0x00 + vpclmulqdq zmm10,zmm5,zmm3,0x01 + vpclmulqdq zmm5,zmm5,zmm3,0x10 + vpxorq zmm5,zmm5,zmm10 + + vpsrldq zmm10,zmm5,8 + vpslldq zmm5,zmm5,8 + vpxorq zmm6,zmm6,zmm10 + vpxorq zmm5,zmm5,zmm7 + + + + vmovdqu64 zmm10,ZMMWORD[POLY2] + + vpclmulqdq zmm7,zmm10,zmm5,0x01 + vpslldq zmm7,zmm7,8 + vpxorq zmm5,zmm5,zmm7 + + + + vpclmulqdq zmm7,zmm10,zmm5,0x00 + vpsrldq zmm7,zmm7,4 + vpclmulqdq zmm5,zmm10,zmm5,0x10 + vpslldq zmm5,zmm5,4 + + vpternlogq zmm5,zmm6,zmm7,0x96 + + vmovdqu64 ZMMWORD[128+rsp],zmm5 + + vpclmulqdq zmm6,zmm4,zmm3,0x11 + vpclmulqdq zmm7,zmm4,zmm3,0x00 + vpclmulqdq zmm10,zmm4,zmm3,0x01 + vpclmulqdq zmm4,zmm4,zmm3,0x10 + vpxorq zmm4,zmm4,zmm10 + + vpsrldq zmm10,zmm4,8 + vpslldq zmm4,zmm4,8 + vpxorq zmm6,zmm6,zmm10 + vpxorq zmm4,zmm4,zmm7 + + + + vmovdqu64 zmm10,ZMMWORD[POLY2] + + vpclmulqdq zmm7,zmm10,zmm4,0x01 + vpslldq zmm7,zmm7,8 + vpxorq zmm4,zmm4,zmm7 + + + + vpclmulqdq zmm7,zmm10,zmm4,0x00 + vpsrldq zmm7,zmm7,4 + vpclmulqdq zmm4,zmm10,zmm4,0x10 + vpslldq zmm4,zmm4,4 + + vpternlogq zmm4,zmm6,zmm7,0x96 + + vmovdqu64 ZMMWORD[64+rsp],zmm4 + + 
vpclmulqdq zmm6,zmm5,zmm3,0x11 + vpclmulqdq zmm7,zmm5,zmm3,0x00 + vpclmulqdq zmm10,zmm5,zmm3,0x01 + vpclmulqdq zmm5,zmm5,zmm3,0x10 + vpxorq zmm5,zmm5,zmm10 + + vpsrldq zmm10,zmm5,8 + vpslldq zmm5,zmm5,8 + vpxorq zmm6,zmm6,zmm10 + vpxorq zmm5,zmm5,zmm7 + + + + vmovdqu64 zmm10,ZMMWORD[POLY2] + + vpclmulqdq zmm7,zmm10,zmm5,0x01 + vpslldq zmm7,zmm7,8 + vpxorq zmm5,zmm5,zmm7 + + + + vpclmulqdq zmm7,zmm10,zmm5,0x00 + vpsrldq zmm7,zmm7,4 + vpclmulqdq zmm5,zmm10,zmm5,0x10 + vpslldq zmm5,zmm5,4 + + vpternlogq zmm5,zmm6,zmm7,0x96 + + vmovdqu64 ZMMWORD[rsp],zmm5 +$L$_skip_hkeys_precomputation_826: + mov r14,1 + add r11,512 + sub r13,512 + + cmp r13,768 + jb NEAR $L$_no_more_big_nblocks_821 +$L$_encrypt_big_nblocks_821: + cmp r15b,240 + jae NEAR $L$_16_blocks_overflow_827 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + vpaddd zmm5,zmm4,zmm27 + jmp NEAR $L$_16_blocks_ok_827 +$L$_16_blocks_overflow_827: + vpshufb zmm2,zmm2,zmm29 + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpaddd zmm5,zmm4,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb zmm5,zmm5,zmm29 +$L$_16_blocks_ok_827: + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rsp] + + + + + vshufi64x2 zmm2,zmm5,zmm5,255 + add r15b,16 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + + + + + + + + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + + vpclmulqdq zmm6,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + + + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + + + + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + + vpternlogq zmm6,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + + + + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + + + + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20,ZMMWORD[128+r11*1+r9] + vmovdqu8 zmm21,ZMMWORD[192+r11*1+r9] + + + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + + + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + 
vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm26,zmm10,zmm15 + vpxorq zmm24,zmm6,zmm12 + vpxorq zmm25,zmm7,zmm13 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vaesenclast zmm5,zmm5,zmm30 + + + + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vpxorq zmm5,zmm5,zmm21 + + + + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm4 + vmovdqu8 ZMMWORD[192+r11*1+r10],zmm5 + vpshufb zmm0,zmm17,zmm29 + vpshufb zmm3,zmm19,zmm29 + vpshufb zmm4,zmm20,zmm29 + vpshufb zmm5,zmm21,zmm29 + vmovdqa64 ZMMWORD[1280+rsp],zmm0 + vmovdqa64 ZMMWORD[1344+rsp],zmm3 + vmovdqa64 ZMMWORD[1408+rsp],zmm4 + vmovdqa64 ZMMWORD[1472+rsp],zmm5 + cmp r15b,240 + jae NEAR $L$_16_blocks_overflow_828 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + vpaddd zmm5,zmm4,zmm27 + jmp NEAR $L$_16_blocks_ok_828 +$L$_16_blocks_overflow_828: + vpshufb zmm2,zmm2,zmm29 + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpaddd zmm5,zmm4,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb zmm5,zmm5,zmm29 +$L$_16_blocks_ok_828: + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1024+rsp] + vmovdqu64 zmm1,ZMMWORD[256+rsp] + + + + + vshufi64x2 zmm2,zmm5,zmm5,255 + add r15b,16 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[320+rsp] + vmovdqa64 zmm22,ZMMWORD[1088+rsp] + + + + + + + + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + + vpclmulqdq zmm6,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[384+rsp] + vmovdqa64 zmm8,ZMMWORD[1152+rsp] + + + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[448+rsp] + vmovdqa64 zmm22,ZMMWORD[1216+rsp] + + + + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq 
zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + + vpternlogq zmm6,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + + + + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + + + + vmovdqu8 zmm17,ZMMWORD[256+r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[320+r11*1+r9] + vmovdqu8 zmm20,ZMMWORD[384+r11*1+r9] + vmovdqu8 zmm21,ZMMWORD[448+r11*1+r9] + + + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + + + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm26,zmm10,zmm15,0x96 + vpternlogq zmm24,zmm6,zmm12,0x96 + vpternlogq zmm25,zmm7,zmm13,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vaesenclast zmm5,zmm5,zmm30 + + + + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vpxorq zmm5,zmm5,zmm21 + + + + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[256+r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[320+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[384+r11*1+r10],zmm4 + vmovdqu8 ZMMWORD[448+r11*1+r10],zmm5 + vpshufb zmm0,zmm17,zmm29 + vpshufb zmm3,zmm19,zmm29 + vpshufb zmm4,zmm20,zmm29 + vpshufb zmm5,zmm21,zmm29 + vmovdqa64 ZMMWORD[768+rsp],zmm0 + vmovdqa64 ZMMWORD[832+rsp],zmm3 + vmovdqa64 ZMMWORD[896+rsp],zmm4 + vmovdqa64 ZMMWORD[960+rsp],zmm5 + cmp r15b,240 + jae NEAR $L$_16_blocks_overflow_829 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + vpaddd zmm5,zmm4,zmm27 + jmp NEAR $L$_16_blocks_ok_829 +$L$_16_blocks_overflow_829: + vpshufb zmm2,zmm2,zmm29 + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpaddd zmm5,zmm4,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb zmm5,zmm5,zmm29 +$L$_16_blocks_ok_829: + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1280+rsp] + vmovdqu64 zmm1,ZMMWORD[512+rsp] + + + + + vshufi64x2 
zmm2,zmm5,zmm5,255 + add r15b,16 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[576+rsp] + vmovdqa64 zmm22,ZMMWORD[1344+rsp] + + + + + + + + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + + vpclmulqdq zmm6,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[640+rsp] + vmovdqa64 zmm8,ZMMWORD[1408+rsp] + + + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[704+rsp] + vmovdqa64 zmm22,ZMMWORD[1472+rsp] + + + + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + + vpternlogq zmm6,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + + + + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + + + + vmovdqu8 zmm17,ZMMWORD[512+r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[576+r11*1+r9] + vmovdqu8 zmm20,ZMMWORD[640+r11*1+r9] + vmovdqu8 zmm21,ZMMWORD[704+r11*1+r9] + + + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + + + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + + + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm10,zmm26,zmm15,0x96 + + vpsrldq zmm15,zmm10,8 + vpslldq zmm10,zmm10,8 + + vmovdqa64 xmm16,XMMWORD[POLY2] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vpternlogq zmm6,zmm12,zmm15,0x96 + vpxorq zmm6,zmm6,zmm24 + vpternlogq zmm7,zmm13,zmm10,0x96 + vpxorq zmm7,zmm7,zmm25 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vextracti64x4 ymm12,zmm6,1 + vpxorq ymm6,ymm6,ymm12 + vextracti32x4 xmm12,ymm6,1 + vpxorq xmm6,xmm6,xmm12 + vextracti64x4 ymm13,zmm7,1 + vpxorq ymm7,ymm7,ymm13 + vextracti32x4 xmm13,ymm7,1 + vpxorq xmm7,xmm7,xmm13 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vpclmulqdq xmm13,xmm16,xmm7,0x01 + vpslldq xmm13,xmm13,8 + vpxorq xmm13,xmm7,xmm13 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + + vaesenc 
zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vpclmulqdq xmm12,xmm16,xmm13,0x00 + vpsrldq xmm12,xmm12,4 + vpclmulqdq xmm15,xmm16,xmm13,0x10 + vpslldq xmm15,xmm15,4 + + vpternlogq xmm6,xmm15,xmm12,0x96 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vaesenclast zmm5,zmm5,zmm30 + + + + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vpxorq zmm5,zmm5,zmm21 + + + + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[512+r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[576+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[640+r11*1+r10],zmm4 + vmovdqu8 ZMMWORD[704+r11*1+r10],zmm5 + vpshufb zmm0,zmm17,zmm29 + vpshufb zmm3,zmm19,zmm29 + vpshufb zmm4,zmm20,zmm29 + vpshufb zmm5,zmm21,zmm29 + vmovdqa64 ZMMWORD[1024+rsp],zmm0 + vmovdqa64 ZMMWORD[1088+rsp],zmm3 + vmovdqa64 ZMMWORD[1152+rsp],zmm4 + vmovdqa64 ZMMWORD[1216+rsp],zmm5 + vmovdqa64 zmm14,zmm6 + + add r11,768 + sub r13,768 + cmp r13,768 + jae NEAR $L$_encrypt_big_nblocks_821 + +$L$_no_more_big_nblocks_821: + + cmp r13,512 + jae NEAR $L$_encrypt_32_blocks_821 + + cmp r13,256 + jae NEAR $L$_encrypt_16_blocks_821 +$L$_encrypt_0_blocks_ghash_32_821: + mov r10d,r13d + and r10d,~15 + mov ebx,256 + sub ebx,r10d + vmovdqa64 zmm13,ZMMWORD[768+rsp] + vpxorq zmm13,zmm13,zmm14 + vmovdqu64 zmm12,ZMMWORD[rbx*1+rsp] + vpclmulqdq zmm0,zmm13,zmm12,0x11 + vpclmulqdq zmm3,zmm13,zmm12,0x00 + vpclmulqdq zmm4,zmm13,zmm12,0x01 + vpclmulqdq zmm5,zmm13,zmm12,0x10 + vmovdqa64 zmm13,ZMMWORD[832+rsp] + vmovdqu64 zmm12,ZMMWORD[64+rbx*1+rsp] + vpclmulqdq zmm6,zmm13,zmm12,0x11 + vpclmulqdq zmm7,zmm13,zmm12,0x00 + vpclmulqdq zmm10,zmm13,zmm12,0x01 + vpclmulqdq zmm11,zmm13,zmm12,0x10 + vpxorq zmm26,zmm4,zmm10 + vpxorq zmm24,zmm0,zmm6 + vpxorq zmm25,zmm3,zmm7 + vpternlogq zmm26,zmm5,zmm11,0x96 + vmovdqa64 zmm13,ZMMWORD[896+rsp] + vmovdqu64 zmm12,ZMMWORD[128+rbx*1+rsp] + vpclmulqdq zmm0,zmm13,zmm12,0x11 + vpclmulqdq zmm3,zmm13,zmm12,0x00 + vpclmulqdq zmm4,zmm13,zmm12,0x01 + vpclmulqdq zmm5,zmm13,zmm12,0x10 + vmovdqa64 zmm13,ZMMWORD[960+rsp] + vmovdqu64 zmm12,ZMMWORD[192+rbx*1+rsp] + vpclmulqdq zmm6,zmm13,zmm12,0x11 + vpclmulqdq zmm7,zmm13,zmm12,0x00 + vpclmulqdq zmm10,zmm13,zmm12,0x01 + vpclmulqdq zmm11,zmm13,zmm12,0x10 + + vpternlogq zmm26,zmm4,zmm10,0x96 + vpternlogq zmm24,zmm0,zmm6,0x96 + vpternlogq zmm25,zmm3,zmm7,0x96 + vpternlogq zmm26,zmm5,zmm11,0x96 + add ebx,256 + mov r10d,r13d + add r10d,15 + shr r10d,4 + je NEAR $L$_last_num_blocks_is_0_830 + + cmp r10d,8 + je NEAR $L$_last_num_blocks_is_8_830 + jb NEAR $L$_last_num_blocks_is_7_1_830 + + + cmp r10d,12 + je NEAR $L$_last_num_blocks_is_12_830 + jb NEAR $L$_last_num_blocks_is_11_9_830 + + + cmp r10d,15 + je NEAR $L$_last_num_blocks_is_15_830 + ja NEAR $L$_last_num_blocks_is_16_830 + cmp r10d,14 + je NEAR $L$_last_num_blocks_is_14_830 + jmp NEAR $L$_last_num_blocks_is_13_830 + +$L$_last_num_blocks_is_11_9_830: + + cmp r10d,10 + je NEAR $L$_last_num_blocks_is_10_830 + ja NEAR $L$_last_num_blocks_is_11_830 + jmp NEAR $L$_last_num_blocks_is_9_830 + +$L$_last_num_blocks_is_7_1_830: + cmp r10d,4 + je NEAR $L$_last_num_blocks_is_4_830 + jb NEAR $L$_last_num_blocks_is_3_1_830 + + cmp r10d,6 + ja NEAR $L$_last_num_blocks_is_7_830 + je NEAR $L$_last_num_blocks_is_6_830 + jmp NEAR $L$_last_num_blocks_is_5_830 + +$L$_last_num_blocks_is_3_1_830: + + cmp r10d,2 + ja NEAR $L$_last_num_blocks_is_3_830 + je 
NEAR $L$_last_num_blocks_is_2_830 +$L$_last_num_blocks_is_1_830: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + kmovq k1,[rax*8+r10] + cmp r15d,255 + jae NEAR $L$_16_blocks_overflow_831 + vpaddd xmm0,xmm2,xmm28 + jmp NEAR $L$_16_blocks_ok_831 + +$L$_16_blocks_overflow_831: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpshufb xmm0,xmm0,xmm29 +$L$_16_blocks_ok_831: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1024+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm0,0 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1088+rsp] + vpxorq xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[1152+rsp] + vaesenc xmm0,xmm0,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1216+rsp] + vaesenc xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc xmm0,xmm0,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 xmm17{k1}{z},[r11*1+r9] + vaesenc xmm0,xmm0,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm24,zmm14,zmm12,0x96 + vpternlogq zmm25,zmm7,zmm13,0x96 + vpternlogq zmm26,zmm10,zmm15,0x96 + vaesenc xmm0,xmm0,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc xmm0,xmm0,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc xmm0,xmm0,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + vaesenc xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + vaesenc xmm0,xmm0,xmm31 + vaesenclast xmm0,xmm0,xmm30 + vpxorq xmm0,xmm0,xmm17 + vextracti32x4 xmm11,zmm0,0 + mov r10,QWORD[120+rbp] + vmovdqu8 XMMWORD[r11*1+r10]{k1},xmm0 + vmovdqu8 zmm17{k1}{z},zmm17 + vpshufb xmm17,xmm17,xmm29 + vextracti32x4 xmm7,zmm17,0 + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_832 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm17,xmm1,0x01 + vpclmulqdq xmm5,xmm17,xmm1,0x10 + vpclmulqdq xmm0,xmm17,xmm1,0x11 + vpclmulqdq xmm3,xmm17,xmm1,0x00 + vpxorq zmm4,zmm4,zmm26 + vpxorq zmm0,zmm0,zmm24 + vpxorq zmm3,zmm3,zmm25 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 
+ vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_832 +$L$_small_initial_partial_block_832: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + + + vpsrldq zmm0,zmm26,8 + vpslldq zmm3,zmm26,8 + vpxorq zmm24,zmm24,zmm0 + vpxorq zmm25,zmm25,zmm3 + vextracti64x4 ymm0,zmm24,1 + vpxorq ymm24,ymm24,ymm0 + vextracti32x4 xmm0,ymm24,1 + vpxorq xmm24,xmm24,xmm0 + vextracti64x4 ymm3,zmm25,1 + vpxorq ymm25,ymm25,ymm3 + vextracti32x4 xmm3,ymm25,1 + vpxorq xmm25,xmm25,xmm3 + vmovdqa64 xmm0,XMMWORD[POLY2] + + + vpclmulqdq xmm3,xmm0,xmm25,0x01 + vpslldq xmm3,xmm3,8 + vpxorq xmm3,xmm25,xmm3 + + + vpclmulqdq xmm4,xmm0,xmm3,0x00 + vpsrldq xmm4,xmm4,4 + vpclmulqdq xmm14,xmm0,xmm3,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm4,xmm24,0x96 + + + + + + + + + + + + + vpxorq xmm14,xmm14,xmm7 + + jmp NEAR $L$_after_reduction_832 +$L$_small_initial_compute_done_832: +$L$_after_reduction_832: + jmp NEAR $L$_last_blocks_done_830 +$L$_last_num_blocks_is_2_830: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + kmovq k1,[rax*8+r10] + cmp r15d,254 + jae NEAR $L$_16_blocks_overflow_833 + vpaddd ymm0,ymm2,ymm28 + jmp NEAR $L$_16_blocks_ok_833 + +$L$_16_blocks_overflow_833: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpshufb ymm0,ymm0,ymm29 +$L$_16_blocks_ok_833: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1024+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm0,1 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1088+rsp] + vpxorq ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[1152+rsp] + vaesenc ymm0,ymm0,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1216+rsp] + vaesenc ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc ymm0,ymm0,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 ymm17{k1}{z},[r11*1+r9] + vaesenc ymm0,ymm0,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm24,zmm14,zmm12,0x96 + vpternlogq zmm25,zmm7,zmm13,0x96 + vpternlogq zmm26,zmm10,zmm15,0x96 + vaesenc ymm0,ymm0,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc ymm0,ymm0,ymm31 + vbroadcastf64x2 
zmm31,ZMMWORD[176+rcx] + vaesenc ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc ymm0,ymm0,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + vaesenc ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + vaesenc ymm0,ymm0,ymm31 + vaesenclast ymm0,ymm0,ymm30 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm11,zmm0,1 + mov r10,QWORD[120+rbp] + vmovdqu8 YMMWORD[r11*1+r10]{k1},ymm0 + vmovdqu8 zmm17{k1}{z},zmm17 + vpshufb ymm17,ymm17,ymm29 + vextracti32x4 xmm7,zmm17,1 + sub r13,16 * (2 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_834 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm17,ymm1,0x01 + vpclmulqdq ymm5,ymm17,ymm1,0x10 + vpclmulqdq ymm0,ymm17,ymm1,0x11 + vpclmulqdq ymm3,ymm17,ymm1,0x00 + vpxorq zmm4,zmm4,zmm26 + vpxorq zmm0,zmm0,zmm24 + vpxorq zmm3,zmm3,zmm25 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_834 +$L$_small_initial_partial_block_834: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm17,xmm1,0x01 + vpclmulqdq xmm5,xmm17,xmm1,0x10 + vpclmulqdq xmm0,xmm17,xmm1,0x11 + vpclmulqdq xmm3,xmm17,xmm1,0x00 + vpxorq zmm4,zmm4,zmm26 + vpxorq zmm0,zmm0,zmm24 + vpxorq zmm3,zmm3,zmm25 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_834: + + or r13,r13 + je NEAR $L$_after_reduction_834 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_834: + jmp NEAR $L$_last_blocks_done_830 +$L$_last_num_blocks_is_3_830: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + kmovq k1,[rax*8+r10] + cmp r15d,253 + jae NEAR $L$_16_blocks_overflow_835 + vpaddd zmm0,zmm2,zmm28 + jmp NEAR $L$_16_blocks_ok_835 + +$L$_16_blocks_overflow_835: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpshufb zmm0,zmm0,zmm29 +$L$_16_blocks_ok_835: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1024+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm0,2 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1088+rsp] + vpxorq zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 
zmm8,ZMMWORD[1152+rsp] + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1216+rsp] + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17{k1}{z},[r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm24,zmm14,zmm12,0x96 + vpternlogq zmm25,zmm7,zmm13,0x96 + vpternlogq zmm26,zmm10,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vpxorq zmm0,zmm0,zmm17 + vextracti32x4 xmm11,zmm0,2 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10]{k1},zmm0 + vmovdqu8 zmm17{k1}{z},zmm17 + vpshufb zmm17,zmm17,zmm29 + vextracti32x4 xmm7,zmm17,2 + sub r13,16 * (3 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_836 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpxorq zmm4,zmm4,zmm26 + vpxorq zmm0,zmm0,zmm24 + vpxorq zmm3,zmm3,zmm25 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_836 +$L$_small_initial_partial_block_836: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm17,ymm1,0x01 + vpclmulqdq ymm5,ymm17,ymm1,0x10 + vpclmulqdq ymm0,ymm17,ymm1,0x11 + vpclmulqdq ymm3,ymm17,ymm1,0x00 + vpxorq zmm4,zmm4,zmm26 + vpxorq zmm0,zmm0,zmm24 + vpxorq zmm3,zmm3,zmm25 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + 
vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_836: + + or r13,r13 + je NEAR $L$_after_reduction_836 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_836: + jmp NEAR $L$_last_blocks_done_830 +$L$_last_num_blocks_is_4_830: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + kmovq k1,[rax*8+r10] + cmp r15d,252 + jae NEAR $L$_16_blocks_overflow_837 + vpaddd zmm0,zmm2,zmm28 + jmp NEAR $L$_16_blocks_ok_837 + +$L$_16_blocks_overflow_837: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpshufb zmm0,zmm0,zmm29 +$L$_16_blocks_ok_837: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1024+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm0,3 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1088+rsp] + vpxorq zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[1152+rsp] + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1216+rsp] + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17{k1}{z},[r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm24,zmm14,zmm12,0x96 + vpternlogq zmm25,zmm7,zmm13,0x96 + vpternlogq zmm26,zmm10,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vpxorq zmm0,zmm0,zmm17 + vextracti32x4 xmm11,zmm0,3 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10]{k1},zmm0 + vmovdqu8 zmm17{k1}{z},zmm17 + vpshufb zmm17,zmm17,zmm29 + vextracti32x4 xmm7,zmm17,3 + sub r13,16 * (4 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_838 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 
zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + + vpxorq zmm30,zmm30,zmm26 + vpxorq zmm8,zmm8,zmm24 + vpxorq zmm22,zmm22,zmm25 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_838 +$L$_small_initial_partial_block_838: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpxorq zmm4,zmm4,zmm26 + vpxorq zmm0,zmm0,zmm24 + vpxorq zmm3,zmm3,zmm25 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_838: + + or r13,r13 + je NEAR $L$_after_reduction_838 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_838: + jmp NEAR $L$_last_blocks_done_830 +$L$_last_num_blocks_is_5_830: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,64 + kmovq k1,[rax*8+r10] + cmp r15d,251 + jae NEAR $L$_16_blocks_overflow_839 + vpaddd zmm0,zmm2,zmm28 + vpaddd xmm3,xmm0,xmm27 + jmp NEAR $L$_16_blocks_ok_839 + +$L$_16_blocks_overflow_839: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb xmm3,xmm3,xmm29 +$L$_16_blocks_ok_839: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1024+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm3,0 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1088+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[1152+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1216+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc xmm3,xmm3,xmm30 + vbroadcastf64x2 
zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 xmm19{k1}{z},[64+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm24,zmm14,zmm12,0x96 + vpternlogq zmm25,zmm7,zmm13,0x96 + vpternlogq zmm26,zmm10,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast xmm3,xmm3,xmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq xmm3,xmm3,xmm19 + vextracti32x4 xmm11,zmm3,0 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 XMMWORD[64+r11*1+r10]{k1},xmm3 + vmovdqu8 zmm19{k1}{z},zmm19 + vpshufb zmm17,zmm17,zmm29 + vpshufb xmm19,xmm19,xmm29 + vextracti32x4 xmm7,zmm19,0 + sub r13,16 * (5 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_840 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm19,xmm1,0x01 + vpclmulqdq xmm5,xmm19,xmm1,0x10 + vpclmulqdq xmm0,xmm19,xmm1,0x11 + vpclmulqdq xmm3,xmm19,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_840 +$L$_small_initial_partial_block_840: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + + vpxorq 
zmm30,zmm30,zmm26 + vpxorq zmm8,zmm8,zmm24 + vpxorq zmm22,zmm22,zmm25 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_840: + + or r13,r13 + je NEAR $L$_after_reduction_840 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_840: + jmp NEAR $L$_last_blocks_done_830 +$L$_last_num_blocks_is_6_830: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,64 + kmovq k1,[rax*8+r10] + cmp r15d,250 + jae NEAR $L$_16_blocks_overflow_841 + vpaddd zmm0,zmm2,zmm28 + vpaddd ymm3,ymm0,ymm27 + jmp NEAR $L$_16_blocks_ok_841 + +$L$_16_blocks_overflow_841: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb ymm3,ymm3,ymm29 +$L$_16_blocks_ok_841: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1024+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm3,1 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1088+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[1152+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1216+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 ymm19{k1}{z},[64+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm24,zmm14,zmm12,0x96 + vpternlogq zmm25,zmm7,zmm13,0x96 + vpternlogq zmm26,zmm10,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] 
+ vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast ymm3,ymm3,ymm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm11,zmm3,1 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 YMMWORD[64+r11*1+r10]{k1},ymm3 + vmovdqu8 zmm19{k1}{z},zmm19 + vpshufb zmm17,zmm17,zmm29 + vpshufb ymm19,ymm19,ymm29 + vextracti32x4 xmm7,zmm19,1 + sub r13,16 * (6 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_842 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm19,ymm1,0x01 + vpclmulqdq ymm5,ymm19,ymm1,0x10 + vpclmulqdq ymm0,ymm19,ymm1,0x11 + vpclmulqdq ymm3,ymm19,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_842 +$L$_small_initial_partial_block_842: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm19,xmm1,0x01 + vpclmulqdq xmm5,xmm19,xmm1,0x10 + vpclmulqdq xmm0,xmm19,xmm1,0x11 + vpclmulqdq xmm3,xmm19,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_842: + + or r13,r13 + je NEAR $L$_after_reduction_842 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_842: + jmp NEAR $L$_last_blocks_done_830 +$L$_last_num_blocks_is_7_830: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,64 + kmovq k1,[rax*8+r10] + cmp r15d,249 + jae NEAR 
$L$_16_blocks_overflow_843 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + jmp NEAR $L$_16_blocks_ok_843 + +$L$_16_blocks_overflow_843: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 +$L$_16_blocks_ok_843: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1024+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm3,2 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1088+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[1152+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1216+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19{k1}{z},[64+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm24,zmm14,zmm12,0x96 + vpternlogq zmm25,zmm7,zmm13,0x96 + vpternlogq zmm26,zmm10,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti32x4 xmm11,zmm3,2 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10]{k1},zmm3 + vmovdqu8 zmm19{k1}{z},zmm19 + vpshufb zmm17,zmm17,zmm29 + vpshufb zmm19,zmm19,zmm29 + vextracti32x4 xmm7,zmm19,2 + sub r13,16 * (7 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_844 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq 
zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm19,zmm1,0x01 + vpclmulqdq zmm5,zmm19,zmm1,0x10 + vpclmulqdq zmm0,zmm19,zmm1,0x11 + vpclmulqdq zmm3,zmm19,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_844 +$L$_small_initial_partial_block_844: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm19,ymm1,0x01 + vpclmulqdq ymm5,ymm19,ymm1,0x10 + vpclmulqdq ymm0,ymm19,ymm1,0x11 + vpclmulqdq ymm3,ymm19,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_844: + + or r13,r13 + je NEAR $L$_after_reduction_844 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_844: + jmp NEAR $L$_last_blocks_done_830 +$L$_last_num_blocks_is_8_830: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,64 + kmovq k1,[rax*8+r10] + cmp r15d,248 + jae NEAR $L$_16_blocks_overflow_845 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + jmp NEAR $L$_16_blocks_ok_845 + +$L$_16_blocks_overflow_845: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 +$L$_16_blocks_ok_845: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1024+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm3,3 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1088+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[1152+rsp] + vaesenc 
zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1216+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19{k1}{z},[64+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm24,zmm14,zmm12,0x96 + vpternlogq zmm25,zmm7,zmm13,0x96 + vpternlogq zmm26,zmm10,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti32x4 xmm11,zmm3,3 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10]{k1},zmm3 + vmovdqu8 zmm19{k1}{z},zmm19 + vpshufb zmm17,zmm17,zmm29 + vpshufb zmm19,zmm19,zmm29 + vextracti32x4 xmm7,zmm19,3 + sub r13,16 * (8 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_846 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[224+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + + vpxorq zmm30,zmm30,zmm26 + vpxorq zmm8,zmm8,zmm24 + vpxorq zmm22,zmm22,zmm25 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + 
vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_846 +$L$_small_initial_partial_block_846: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm19,zmm1,0x01 + vpclmulqdq zmm5,zmm19,zmm1,0x10 + vpclmulqdq zmm0,zmm19,zmm1,0x11 + vpclmulqdq zmm3,zmm19,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_846: + + or r13,r13 + je NEAR $L$_after_reduction_846 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_846: + jmp NEAR $L$_last_blocks_done_830 +$L$_last_num_blocks_is_9_830: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,128 + kmovq k1,[rax*8+r10] + cmp r15d,247 + jae NEAR $L$_16_blocks_overflow_847 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd xmm4,xmm3,xmm27 + jmp NEAR $L$_16_blocks_ok_847 + +$L$_16_blocks_overflow_847: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb xmm4,xmm4,xmm29 +$L$_16_blocks_ok_847: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1024+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm4,0 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1088+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[1152+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1216+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq 
zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 xmm20{k1}{z},[128+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm24,zmm14,zmm12,0x96 + vpternlogq zmm25,zmm7,zmm13,0x96 + vpternlogq zmm26,zmm10,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast xmm4,xmm4,xmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq xmm4,xmm4,xmm20 + vextracti32x4 xmm11,zmm4,0 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 XMMWORD[128+r11*1+r10]{k1},xmm4 + vmovdqu8 zmm20{k1}{z},zmm20 + vpshufb zmm17,zmm17,zmm29 + vpshufb zmm19,zmm19,zmm29 + vpshufb xmm20,xmm20,xmm29 + vextracti32x4 xmm7,zmm20,0 + sub r13,16 * (9 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_848 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[208+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm20,xmm1,0x01 + vpclmulqdq xmm5,xmm20,xmm1,0x10 + vpclmulqdq xmm0,xmm20,xmm1,0x11 + vpclmulqdq xmm3,xmm20,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq 
xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_848 +$L$_small_initial_partial_block_848: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[224+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + + vpxorq zmm30,zmm30,zmm26 + vpxorq zmm8,zmm8,zmm24 + vpxorq zmm22,zmm22,zmm25 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_848: + + or r13,r13 + je NEAR $L$_after_reduction_848 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_848: + jmp NEAR $L$_last_blocks_done_830 +$L$_last_num_blocks_is_10_830: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,128 + kmovq k1,[rax*8+r10] + cmp r15d,246 + jae NEAR $L$_16_blocks_overflow_849 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd ymm4,ymm3,ymm27 + jmp NEAR $L$_16_blocks_ok_849 + +$L$_16_blocks_overflow_849: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb ymm4,ymm4,ymm29 +$L$_16_blocks_ok_849: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1024+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm4,1 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1088+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[1152+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1216+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc 
zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 ymm20{k1}{z},[128+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm24,zmm14,zmm12,0x96 + vpternlogq zmm25,zmm7,zmm13,0x96 + vpternlogq zmm26,zmm10,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast ymm4,ymm4,ymm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq ymm4,ymm4,ymm20 + vextracti32x4 xmm11,zmm4,1 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 YMMWORD[128+r11*1+r10]{k1},ymm4 + vmovdqu8 zmm20{k1}{z},zmm20 + vpshufb zmm17,zmm17,zmm29 + vpshufb zmm19,zmm19,zmm29 + vpshufb ymm20,ymm20,ymm29 + vextracti32x4 xmm7,zmm20,1 + sub r13,16 * (10 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_850 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[192+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm20,ymm1,0x01 + vpclmulqdq ymm5,ymm20,ymm1,0x10 + vpclmulqdq ymm0,ymm20,ymm1,0x11 + vpclmulqdq ymm3,ymm20,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_850 
+$L$_small_initial_partial_block_850: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[208+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm20,xmm1,0x01 + vpclmulqdq xmm5,xmm20,xmm1,0x10 + vpclmulqdq xmm0,xmm20,xmm1,0x11 + vpclmulqdq xmm3,xmm20,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_850: + + or r13,r13 + je NEAR $L$_after_reduction_850 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_850: + jmp NEAR $L$_last_blocks_done_830 +$L$_last_num_blocks_is_11_830: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,128 + kmovq k1,[rax*8+r10] + cmp r15d,245 + jae NEAR $L$_16_blocks_overflow_851 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + jmp NEAR $L$_16_blocks_ok_851 + +$L$_16_blocks_overflow_851: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 +$L$_16_blocks_ok_851: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1024+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm4,2 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1088+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[1152+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1216+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + 
vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20{k1}{z},[128+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm24,zmm14,zmm12,0x96 + vpternlogq zmm25,zmm7,zmm13,0x96 + vpternlogq zmm26,zmm10,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vextracti32x4 xmm11,zmm4,2 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10]{k1},zmm4 + vmovdqu8 zmm20{k1}{z},zmm20 + vpshufb zmm17,zmm17,zmm29 + vpshufb zmm19,zmm19,zmm29 + vpshufb zmm20,zmm20,zmm29 + vextracti32x4 xmm7,zmm20,2 + sub r13,16 * (11 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_852 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[176+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm20,zmm1,0x01 + vpclmulqdq zmm5,zmm20,zmm1,0x10 + vpclmulqdq zmm0,zmm20,zmm1,0x11 + vpclmulqdq zmm3,zmm20,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + 
vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_852 +$L$_small_initial_partial_block_852: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[192+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm20,ymm1,0x01 + vpclmulqdq ymm5,ymm20,ymm1,0x10 + vpclmulqdq ymm0,ymm20,ymm1,0x11 + vpclmulqdq ymm3,ymm20,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_852: + + or r13,r13 + je NEAR $L$_after_reduction_852 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_852: + jmp NEAR $L$_last_blocks_done_830 +$L$_last_num_blocks_is_12_830: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,128 + kmovq k1,[rax*8+r10] + cmp r15d,244 + jae NEAR $L$_16_blocks_overflow_853 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + jmp NEAR $L$_16_blocks_ok_853 + +$L$_16_blocks_overflow_853: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 +$L$_16_blocks_ok_853: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1024+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm4,3 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1088+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[1152+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1216+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + 
vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20{k1}{z},[128+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm24,zmm14,zmm12,0x96 + vpternlogq zmm25,zmm7,zmm13,0x96 + vpternlogq zmm26,zmm10,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vextracti32x4 xmm11,zmm4,3 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10]{k1},zmm4 + vmovdqu8 zmm20{k1}{z},zmm20 + vpshufb zmm17,zmm17,zmm29 + vpshufb zmm19,zmm19,zmm29 + vpshufb zmm20,zmm20,zmm29 + vextracti32x4 xmm7,zmm20,3 + sub r13,16 * (12 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_854 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[160+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[224+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + + vpxorq zmm30,zmm30,zmm26 + vpxorq zmm8,zmm8,zmm24 + vpxorq zmm22,zmm22,zmm25 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq 
xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_854 +$L$_small_initial_partial_block_854: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[176+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm20,zmm1,0x01 + vpclmulqdq zmm5,zmm20,zmm1,0x10 + vpclmulqdq zmm0,zmm20,zmm1,0x11 + vpclmulqdq zmm3,zmm20,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_854: + + or r13,r13 + je NEAR $L$_after_reduction_854 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_854: + jmp NEAR $L$_last_blocks_done_830 +$L$_last_num_blocks_is_13_830: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,192 + kmovq k1,[rax*8+r10] + cmp r15d,243 + jae NEAR $L$_16_blocks_overflow_855 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + vpaddd xmm5,xmm4,xmm27 + jmp NEAR $L$_16_blocks_ok_855 + +$L$_16_blocks_overflow_855: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpaddd zmm5,zmm4,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb xmm5,xmm5,xmm29 +$L$_16_blocks_ok_855: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1024+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm5,0 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1088+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vpxorq xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[1152+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + 
vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1216+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20,ZMMWORD[128+r11*1+r9] + vmovdqu8 xmm21{k1}{z},[192+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm24,zmm14,zmm12,0x96 + vpternlogq zmm25,zmm7,zmm13,0x96 + vpternlogq zmm26,zmm10,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vaesenclast xmm5,xmm5,xmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vpxorq xmm5,xmm5,xmm21 + vextracti32x4 xmm11,zmm5,0 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm4 + vmovdqu8 XMMWORD[192+r11*1+r10]{k1},xmm5 + vmovdqu8 zmm21{k1}{z},zmm21 + vpshufb zmm17,zmm17,zmm29 + vpshufb zmm19,zmm19,zmm29 + vpshufb zmm20,zmm20,zmm29 + vpshufb xmm21,xmm21,xmm29 + vextracti32x4 xmm7,zmm21,0 + sub r13,16 * (13 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_856 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[144+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[208+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq 
zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm21,xmm1,0x01 + vpclmulqdq xmm5,xmm21,xmm1,0x10 + vpclmulqdq xmm0,xmm21,xmm1,0x11 + vpclmulqdq xmm3,xmm21,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_856 +$L$_small_initial_partial_block_856: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[160+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[224+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + + vpxorq zmm30,zmm30,zmm26 + vpxorq zmm8,zmm8,zmm24 + vpxorq zmm22,zmm22,zmm25 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_856: + + or r13,r13 + je NEAR $L$_after_reduction_856 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_856: + jmp NEAR $L$_last_blocks_done_830 +$L$_last_num_blocks_is_14_830: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,192 + kmovq k1,[rax*8+r10] + cmp r15d,242 + jae NEAR $L$_16_blocks_overflow_857 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + vpaddd ymm5,ymm4,ymm27 + jmp NEAR $L$_16_blocks_ok_857 + +$L$_16_blocks_overflow_857: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpaddd zmm5,zmm4,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb 
ymm5,ymm5,ymm29 +$L$_16_blocks_ok_857: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1024+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm5,1 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1088+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vpxorq ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[1152+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1216+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20,ZMMWORD[128+r11*1+r9] + vmovdqu8 ymm21{k1}{z},[192+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm24,zmm14,zmm12,0x96 + vpternlogq zmm25,zmm7,zmm13,0x96 + vpternlogq zmm26,zmm10,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast 
zmm4,zmm4,zmm30 + vaesenclast ymm5,ymm5,ymm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vpxorq ymm5,ymm5,ymm21 + vextracti32x4 xmm11,zmm5,1 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm4 + vmovdqu8 YMMWORD[192+r11*1+r10]{k1},ymm5 + vmovdqu8 zmm21{k1}{z},zmm21 + vpshufb zmm17,zmm17,zmm29 + vpshufb zmm19,zmm19,zmm29 + vpshufb zmm20,zmm20,zmm29 + vpshufb ymm21,ymm21,ymm29 + vextracti32x4 xmm7,zmm21,1 + sub r13,16 * (14 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_858 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[128+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[192+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm21,ymm1,0x01 + vpclmulqdq ymm5,ymm21,ymm1,0x10 + vpclmulqdq ymm0,ymm21,ymm1,0x11 + vpclmulqdq ymm3,ymm21,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_858 +$L$_small_initial_partial_block_858: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[144+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[208+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm21,xmm1,0x01 + vpclmulqdq xmm5,xmm21,xmm1,0x10 + vpclmulqdq xmm0,xmm21,xmm1,0x11 + vpclmulqdq xmm3,xmm21,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq 
xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_858: + + or r13,r13 + je NEAR $L$_after_reduction_858 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_858: + jmp NEAR $L$_last_blocks_done_830 +$L$_last_num_blocks_is_15_830: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,192 + kmovq k1,[rax*8+r10] + cmp r15d,241 + jae NEAR $L$_16_blocks_overflow_859 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + vpaddd zmm5,zmm4,zmm27 + jmp NEAR $L$_16_blocks_ok_859 + +$L$_16_blocks_overflow_859: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpaddd zmm5,zmm4,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb zmm5,zmm5,zmm29 +$L$_16_blocks_ok_859: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1024+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm5,2 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1088+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[1152+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1216+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20,ZMMWORD[128+r11*1+r9] + vmovdqu8 zmm21{k1}{z},[192+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq 
zmm10,zmm11,zmm16,0x96 + vpternlogq zmm24,zmm14,zmm12,0x96 + vpternlogq zmm25,zmm7,zmm13,0x96 + vpternlogq zmm26,zmm10,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vaesenclast zmm5,zmm5,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vpxorq zmm5,zmm5,zmm21 + vextracti32x4 xmm11,zmm5,2 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm4 + vmovdqu8 ZMMWORD[192+r11*1+r10]{k1},zmm5 + vmovdqu8 zmm21{k1}{z},zmm21 + vpshufb zmm17,zmm17,zmm29 + vpshufb zmm19,zmm19,zmm29 + vpshufb zmm20,zmm20,zmm29 + vpshufb zmm21,zmm21,zmm29 + vextracti32x4 xmm7,zmm21,2 + sub r13,16 * (15 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_860 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[112+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[176+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm21,zmm1,0x01 + vpclmulqdq zmm5,zmm21,zmm1,0x10 + vpclmulqdq zmm0,zmm21,zmm1,0x11 + vpclmulqdq zmm3,zmm21,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_860 +$L$_small_initial_partial_block_860: + + + + + + + + + mov 
QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[128+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[192+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm21,ymm1,0x01 + vpclmulqdq ymm5,ymm21,ymm1,0x10 + vpclmulqdq ymm0,ymm21,ymm1,0x11 + vpclmulqdq ymm3,ymm21,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_860: + + or r13,r13 + je NEAR $L$_after_reduction_860 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_860: + jmp NEAR $L$_last_blocks_done_830 +$L$_last_num_blocks_is_16_830: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,192 + kmovq k1,[rax*8+r10] + cmp r15d,240 + jae NEAR $L$_16_blocks_overflow_861 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + vpaddd zmm5,zmm4,zmm27 + jmp NEAR $L$_16_blocks_ok_861 + +$L$_16_blocks_overflow_861: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpaddd zmm5,zmm4,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb zmm5,zmm5,zmm29 +$L$_16_blocks_ok_861: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1024+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm5,3 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1088+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[1152+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[1216+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + 
vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20,ZMMWORD[128+r11*1+r9] + vmovdqu8 zmm21{k1}{z},[192+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm24,zmm14,zmm12,0x96 + vpternlogq zmm25,zmm7,zmm13,0x96 + vpternlogq zmm26,zmm10,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vaesenclast zmm5,zmm5,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vpxorq zmm5,zmm5,zmm21 + vextracti32x4 xmm11,zmm5,3 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm4 + vmovdqu8 ZMMWORD[192+r11*1+r10]{k1},zmm5 + vmovdqu8 zmm21{k1}{z},zmm21 + vpshufb zmm17,zmm17,zmm29 + vpshufb zmm19,zmm19,zmm29 + vpshufb zmm20,zmm20,zmm29 + vpshufb zmm21,zmm21,zmm29 + vextracti32x4 xmm7,zmm21,3 + sub r13,16 * (16 - 1) +$L$_small_initial_partial_block_862: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[112+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[176+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq 
zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm21,zmm1,0x01 + vpclmulqdq zmm5,zmm21,zmm1,0x10 + vpclmulqdq zmm0,zmm21,zmm1,0x11 + vpclmulqdq zmm3,zmm21,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_862: + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_862: + jmp NEAR $L$_last_blocks_done_830 +$L$_last_num_blocks_is_0_830: + vmovdqa64 zmm13,ZMMWORD[1024+rsp] + vmovdqu64 zmm12,ZMMWORD[rbx*1+rsp] + vpclmulqdq zmm0,zmm13,zmm12,0x11 + vpclmulqdq zmm3,zmm13,zmm12,0x00 + vpclmulqdq zmm4,zmm13,zmm12,0x01 + vpclmulqdq zmm5,zmm13,zmm12,0x10 + vmovdqa64 zmm13,ZMMWORD[1088+rsp] + vmovdqu64 zmm12,ZMMWORD[64+rbx*1+rsp] + vpclmulqdq zmm6,zmm13,zmm12,0x11 + vpclmulqdq zmm7,zmm13,zmm12,0x00 + vpclmulqdq zmm10,zmm13,zmm12,0x01 + vpclmulqdq zmm11,zmm13,zmm12,0x10 + vpternlogq zmm26,zmm4,zmm10,0x96 + vpternlogq zmm24,zmm0,zmm6,0x96 + vpternlogq zmm25,zmm3,zmm7,0x96 + vpternlogq zmm26,zmm5,zmm11,0x96 + vmovdqa64 zmm13,ZMMWORD[1152+rsp] + vmovdqu64 zmm12,ZMMWORD[128+rbx*1+rsp] + vpclmulqdq zmm0,zmm13,zmm12,0x11 + vpclmulqdq zmm3,zmm13,zmm12,0x00 + vpclmulqdq zmm4,zmm13,zmm12,0x01 + vpclmulqdq zmm5,zmm13,zmm12,0x10 + vmovdqa64 zmm13,ZMMWORD[1216+rsp] + vmovdqu64 zmm12,ZMMWORD[192+rbx*1+rsp] + vpclmulqdq zmm6,zmm13,zmm12,0x11 + vpclmulqdq zmm7,zmm13,zmm12,0x00 + vpclmulqdq zmm10,zmm13,zmm12,0x01 + vpclmulqdq zmm11,zmm13,zmm12,0x10 + + vpternlogq zmm26,zmm4,zmm10,0x96 + vpternlogq zmm24,zmm0,zmm6,0x96 + vpternlogq zmm25,zmm3,zmm7,0x96 + vpternlogq zmm26,zmm5,zmm11,0x96 + + vpsrldq zmm0,zmm26,8 + vpslldq zmm3,zmm26,8 + vpxorq zmm24,zmm24,zmm0 + vpxorq zmm25,zmm25,zmm3 + vextracti64x4 ymm0,zmm24,1 + vpxorq ymm24,ymm24,ymm0 + vextracti32x4 xmm0,ymm24,1 + vpxorq xmm24,xmm24,xmm0 + vextracti64x4 ymm3,zmm25,1 + vpxorq ymm25,ymm25,ymm3 + vextracti32x4 xmm3,ymm25,1 + vpxorq xmm25,xmm25,xmm3 + vmovdqa64 xmm4,XMMWORD[POLY2] + + + vpclmulqdq xmm0,xmm4,xmm25,0x01 + vpslldq xmm0,xmm0,8 + vpxorq xmm0,xmm25,xmm0 + + + vpclmulqdq xmm3,xmm4,xmm0,0x00 + vpsrldq xmm3,xmm3,4 + vpclmulqdq xmm14,xmm4,xmm0,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm3,xmm24,0x96 + +$L$_last_blocks_done_830: + vpshufb xmm2,xmm2,xmm29 + jmp NEAR $L$_ghash_done_821 +$L$_encrypt_32_blocks_821: + cmp r15b,240 + jae NEAR $L$_16_blocks_overflow_863 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + vpaddd zmm5,zmm4,zmm27 + jmp NEAR $L$_16_blocks_ok_863 +$L$_16_blocks_overflow_863: + vpshufb zmm2,zmm2,zmm29 + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpaddd zmm5,zmm4,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb 
zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb zmm5,zmm5,zmm29 +$L$_16_blocks_ok_863: + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rsp] + + + + + vshufi64x2 zmm2,zmm5,zmm5,255 + add r15b,16 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + + + + + + + + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + + vpclmulqdq zmm6,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + + + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + + + + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + + vpternlogq zmm6,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + + + + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + + + + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20,ZMMWORD[128+r11*1+r9] + vmovdqu8 zmm21,ZMMWORD[192+r11*1+r9] + + + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + + + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm26,zmm10,zmm15 + vpxorq zmm24,zmm6,zmm12 + vpxorq zmm25,zmm7,zmm13 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vaesenclast 
zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vaesenclast zmm5,zmm5,zmm30 + + + + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vpxorq zmm5,zmm5,zmm21 + + + + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm4 + vmovdqu8 ZMMWORD[192+r11*1+r10],zmm5 + vpshufb zmm0,zmm17,zmm29 + vpshufb zmm3,zmm19,zmm29 + vpshufb zmm4,zmm20,zmm29 + vpshufb zmm5,zmm21,zmm29 + vmovdqa64 ZMMWORD[1280+rsp],zmm0 + vmovdqa64 ZMMWORD[1344+rsp],zmm3 + vmovdqa64 ZMMWORD[1408+rsp],zmm4 + vmovdqa64 ZMMWORD[1472+rsp],zmm5 + cmp r15b,240 + jae NEAR $L$_16_blocks_overflow_864 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + vpaddd zmm5,zmm4,zmm27 + jmp NEAR $L$_16_blocks_ok_864 +$L$_16_blocks_overflow_864: + vpshufb zmm2,zmm2,zmm29 + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpaddd zmm5,zmm4,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb zmm5,zmm5,zmm29 +$L$_16_blocks_ok_864: + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1024+rsp] + vmovdqu64 zmm1,ZMMWORD[256+rsp] + + + + + vshufi64x2 zmm2,zmm5,zmm5,255 + add r15b,16 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[320+rsp] + vmovdqa64 zmm22,ZMMWORD[1088+rsp] + + + + + + + + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + + vpclmulqdq zmm6,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[384+rsp] + vmovdqa64 zmm8,ZMMWORD[1152+rsp] + + + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[448+rsp] + vmovdqa64 zmm22,ZMMWORD[1216+rsp] + + + + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + + vpternlogq zmm6,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + + + + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + + + + vmovdqu8 zmm17,ZMMWORD[256+r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[320+r11*1+r9] + vmovdqu8 zmm20,ZMMWORD[384+r11*1+r9] + vmovdqu8 zmm21,ZMMWORD[448+r11*1+r9] + + + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + + + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq 
zmm10,zmm11,zmm16,0x96 + vpternlogq zmm26,zmm10,zmm15,0x96 + vpternlogq zmm24,zmm6,zmm12,0x96 + vpternlogq zmm25,zmm7,zmm13,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vaesenclast zmm5,zmm5,zmm30 + + + + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vpxorq zmm5,zmm5,zmm21 + + + + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[256+r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[320+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[384+r11*1+r10],zmm4 + vmovdqu8 ZMMWORD[448+r11*1+r10],zmm5 + vpshufb zmm0,zmm17,zmm29 + vpshufb zmm3,zmm19,zmm29 + vpshufb zmm4,zmm20,zmm29 + vpshufb zmm5,zmm21,zmm29 + vmovdqa64 ZMMWORD[768+rsp],zmm0 + vmovdqa64 ZMMWORD[832+rsp],zmm3 + vmovdqa64 ZMMWORD[896+rsp],zmm4 + vmovdqa64 ZMMWORD[960+rsp],zmm5 + vmovdqa64 zmm13,ZMMWORD[1280+rsp] + vmovdqu64 zmm12,ZMMWORD[512+rsp] + vpclmulqdq zmm0,zmm13,zmm12,0x11 + vpclmulqdq zmm3,zmm13,zmm12,0x00 + vpclmulqdq zmm4,zmm13,zmm12,0x01 + vpclmulqdq zmm5,zmm13,zmm12,0x10 + vmovdqa64 zmm13,ZMMWORD[1344+rsp] + vmovdqu64 zmm12,ZMMWORD[576+rsp] + vpclmulqdq zmm6,zmm13,zmm12,0x11 + vpclmulqdq zmm7,zmm13,zmm12,0x00 + vpclmulqdq zmm10,zmm13,zmm12,0x01 + vpclmulqdq zmm11,zmm13,zmm12,0x10 + vpternlogq zmm26,zmm4,zmm10,0x96 + vpternlogq zmm24,zmm0,zmm6,0x96 + vpternlogq zmm25,zmm3,zmm7,0x96 + vpternlogq zmm26,zmm5,zmm11,0x96 + vmovdqa64 zmm13,ZMMWORD[1408+rsp] + vmovdqu64 zmm12,ZMMWORD[640+rsp] + vpclmulqdq zmm0,zmm13,zmm12,0x11 + vpclmulqdq zmm3,zmm13,zmm12,0x00 + vpclmulqdq zmm4,zmm13,zmm12,0x01 + vpclmulqdq zmm5,zmm13,zmm12,0x10 + vmovdqa64 zmm13,ZMMWORD[1472+rsp] + vmovdqu64 zmm12,ZMMWORD[704+rsp] + vpclmulqdq zmm6,zmm13,zmm12,0x11 + vpclmulqdq zmm7,zmm13,zmm12,0x00 + vpclmulqdq zmm10,zmm13,zmm12,0x01 + vpclmulqdq zmm11,zmm13,zmm12,0x10 + + vpternlogq zmm26,zmm4,zmm10,0x96 + vpternlogq zmm24,zmm0,zmm6,0x96 + vpternlogq zmm25,zmm3,zmm7,0x96 + vpternlogq zmm26,zmm5,zmm11,0x96 + + vpsrldq zmm0,zmm26,8 + vpslldq zmm3,zmm26,8 + vpxorq zmm24,zmm24,zmm0 + vpxorq zmm25,zmm25,zmm3 + vextracti64x4 ymm0,zmm24,1 + vpxorq ymm24,ymm24,ymm0 + vextracti32x4 xmm0,ymm24,1 + vpxorq xmm24,xmm24,xmm0 + vextracti64x4 ymm3,zmm25,1 + vpxorq ymm25,ymm25,ymm3 + vextracti32x4 xmm3,ymm25,1 + vpxorq xmm25,xmm25,xmm3 + vmovdqa64 xmm4,XMMWORD[POLY2] + + + vpclmulqdq xmm0,xmm4,xmm25,0x01 + vpslldq xmm0,xmm0,8 + vpxorq xmm0,xmm25,xmm0 + + + vpclmulqdq xmm3,xmm4,xmm0,0x00 + vpsrldq xmm3,xmm3,4 + vpclmulqdq xmm14,xmm4,xmm0,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm3,xmm24,0x96 + + sub r13,512 + add r11,512 + mov r10d,r13d + and r10d,~15 + mov 
ebx,512 + sub ebx,r10d + mov r10d,r13d + add r10d,15 + shr r10d,4 + je NEAR $L$_last_num_blocks_is_0_865 + + cmp r10d,8 + je NEAR $L$_last_num_blocks_is_8_865 + jb NEAR $L$_last_num_blocks_is_7_1_865 + + + cmp r10d,12 + je NEAR $L$_last_num_blocks_is_12_865 + jb NEAR $L$_last_num_blocks_is_11_9_865 + + + cmp r10d,15 + je NEAR $L$_last_num_blocks_is_15_865 + ja NEAR $L$_last_num_blocks_is_16_865 + cmp r10d,14 + je NEAR $L$_last_num_blocks_is_14_865 + jmp NEAR $L$_last_num_blocks_is_13_865 + +$L$_last_num_blocks_is_11_9_865: + + cmp r10d,10 + je NEAR $L$_last_num_blocks_is_10_865 + ja NEAR $L$_last_num_blocks_is_11_865 + jmp NEAR $L$_last_num_blocks_is_9_865 + +$L$_last_num_blocks_is_7_1_865: + cmp r10d,4 + je NEAR $L$_last_num_blocks_is_4_865 + jb NEAR $L$_last_num_blocks_is_3_1_865 + + cmp r10d,6 + ja NEAR $L$_last_num_blocks_is_7_865 + je NEAR $L$_last_num_blocks_is_6_865 + jmp NEAR $L$_last_num_blocks_is_5_865 + +$L$_last_num_blocks_is_3_1_865: + + cmp r10d,2 + ja NEAR $L$_last_num_blocks_is_3_865 + je NEAR $L$_last_num_blocks_is_2_865 +$L$_last_num_blocks_is_1_865: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + kmovq k1,[rax*8+r10] + cmp r15d,255 + jae NEAR $L$_16_blocks_overflow_866 + vpaddd xmm0,xmm2,xmm28 + jmp NEAR $L$_16_blocks_ok_866 + +$L$_16_blocks_overflow_866: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpshufb xmm0,xmm0,xmm29 +$L$_16_blocks_ok_866: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm0,0 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc xmm0,xmm0,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc xmm0,xmm0,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 xmm17{k1}{z},[r11*1+r9] + vaesenc xmm0,xmm0,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc xmm0,xmm0,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc xmm0,xmm0,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc xmm0,xmm0,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + vaesenc 
xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + vaesenc xmm0,xmm0,xmm31 + vaesenclast xmm0,xmm0,xmm30 + vpxorq xmm0,xmm0,xmm17 + vextracti32x4 xmm11,zmm0,0 + mov r10,QWORD[120+rbp] + vmovdqu8 XMMWORD[r11*1+r10]{k1},xmm0 + vmovdqu8 zmm17{k1}{z},zmm17 + vpshufb xmm17,xmm17,xmm29 + vextracti32x4 xmm7,zmm17,0 + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_867 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm17,xmm1,0x01 + vpclmulqdq xmm5,xmm17,xmm1,0x10 + vpclmulqdq xmm0,xmm17,xmm1,0x11 + vpclmulqdq xmm3,xmm17,xmm1,0x00 + vpxorq zmm4,zmm4,zmm26 + vpxorq zmm0,zmm0,zmm24 + vpxorq zmm3,zmm3,zmm25 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_867 +$L$_small_initial_partial_block_867: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + + + vpsrldq zmm0,zmm26,8 + vpslldq zmm3,zmm26,8 + vpxorq zmm24,zmm24,zmm0 + vpxorq zmm25,zmm25,zmm3 + vextracti64x4 ymm0,zmm24,1 + vpxorq ymm24,ymm24,ymm0 + vextracti32x4 xmm0,ymm24,1 + vpxorq xmm24,xmm24,xmm0 + vextracti64x4 ymm3,zmm25,1 + vpxorq ymm25,ymm25,ymm3 + vextracti32x4 xmm3,ymm25,1 + vpxorq xmm25,xmm25,xmm3 + vmovdqa64 xmm0,XMMWORD[POLY2] + + + vpclmulqdq xmm3,xmm0,xmm25,0x01 + vpslldq xmm3,xmm3,8 + vpxorq xmm3,xmm25,xmm3 + + + vpclmulqdq xmm4,xmm0,xmm3,0x00 + vpsrldq xmm4,xmm4,4 + vpclmulqdq xmm14,xmm0,xmm3,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm4,xmm24,0x96 + + + + + + + + + + + + + vpxorq xmm14,xmm14,xmm7 + + jmp NEAR $L$_after_reduction_867 +$L$_small_initial_compute_done_867: +$L$_after_reduction_867: + jmp NEAR $L$_last_blocks_done_865 +$L$_last_num_blocks_is_2_865: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + kmovq k1,[rax*8+r10] + cmp r15d,254 + jae NEAR $L$_16_blocks_overflow_868 + vpaddd ymm0,ymm2,ymm28 + jmp NEAR $L$_16_blocks_ok_868 + +$L$_16_blocks_overflow_868: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpshufb ymm0,ymm0,ymm29 +$L$_16_blocks_ok_868: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm0,1 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc ymm0,ymm0,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq 
zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc ymm0,ymm0,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 ymm17{k1}{z},[r11*1+r9] + vaesenc ymm0,ymm0,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc ymm0,ymm0,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc ymm0,ymm0,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc ymm0,ymm0,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + vaesenc ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + vaesenc ymm0,ymm0,ymm31 + vaesenclast ymm0,ymm0,ymm30 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm11,zmm0,1 + mov r10,QWORD[120+rbp] + vmovdqu8 YMMWORD[r11*1+r10]{k1},ymm0 + vmovdqu8 zmm17{k1}{z},zmm17 + vpshufb ymm17,ymm17,ymm29 + vextracti32x4 xmm7,zmm17,1 + sub r13,16 * (2 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_869 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm17,ymm1,0x01 + vpclmulqdq ymm5,ymm17,ymm1,0x10 + vpclmulqdq ymm0,ymm17,ymm1,0x11 + vpclmulqdq ymm3,ymm17,ymm1,0x00 + vpxorq zmm4,zmm4,zmm26 + vpxorq zmm0,zmm0,zmm24 + vpxorq zmm3,zmm3,zmm25 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_869 +$L$_small_initial_partial_block_869: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm17,xmm1,0x01 + vpclmulqdq xmm5,xmm17,xmm1,0x10 + vpclmulqdq xmm0,xmm17,xmm1,0x11 + vpclmulqdq xmm3,xmm17,xmm1,0x00 + vpxorq zmm4,zmm4,zmm26 + vpxorq zmm0,zmm0,zmm24 + vpxorq zmm3,zmm3,zmm25 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_869: + + or r13,r13 + je NEAR $L$_after_reduction_869 + vpxorq 
xmm14,xmm14,xmm7 +$L$_after_reduction_869: + jmp NEAR $L$_last_blocks_done_865 +$L$_last_num_blocks_is_3_865: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + kmovq k1,[rax*8+r10] + cmp r15d,253 + jae NEAR $L$_16_blocks_overflow_870 + vpaddd zmm0,zmm2,zmm28 + jmp NEAR $L$_16_blocks_ok_870 + +$L$_16_blocks_overflow_870: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpshufb zmm0,zmm0,zmm29 +$L$_16_blocks_ok_870: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm0,2 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17{k1}{z},[r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vpxorq zmm0,zmm0,zmm17 + vextracti32x4 xmm11,zmm0,2 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10]{k1},zmm0 + vmovdqu8 zmm17{k1}{z},zmm17 + vpshufb zmm17,zmm17,zmm29 + vextracti32x4 xmm7,zmm17,2 + sub r13,16 * (3 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_871 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpxorq zmm4,zmm4,zmm26 + vpxorq zmm0,zmm0,zmm24 + vpxorq zmm3,zmm3,zmm25 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 
ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_871 +$L$_small_initial_partial_block_871: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm17,ymm1,0x01 + vpclmulqdq ymm5,ymm17,ymm1,0x10 + vpclmulqdq ymm0,ymm17,ymm1,0x11 + vpclmulqdq ymm3,ymm17,ymm1,0x00 + vpxorq zmm4,zmm4,zmm26 + vpxorq zmm0,zmm0,zmm24 + vpxorq zmm3,zmm3,zmm25 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_871: + + or r13,r13 + je NEAR $L$_after_reduction_871 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_871: + jmp NEAR $L$_last_blocks_done_865 +$L$_last_num_blocks_is_4_865: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + kmovq k1,[rax*8+r10] + cmp r15d,252 + jae NEAR $L$_16_blocks_overflow_872 + vpaddd zmm0,zmm2,zmm28 + jmp NEAR $L$_16_blocks_ok_872 + +$L$_16_blocks_overflow_872: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpshufb zmm0,zmm0,zmm29 +$L$_16_blocks_ok_872: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm0,3 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17{k1}{z},[r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 
zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vpxorq zmm0,zmm0,zmm17 + vextracti32x4 xmm11,zmm0,3 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10]{k1},zmm0 + vmovdqu8 zmm17{k1}{z},zmm17 + vpshufb zmm17,zmm17,zmm29 + vextracti32x4 xmm7,zmm17,3 + sub r13,16 * (4 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_873 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + + vpxorq zmm30,zmm30,zmm26 + vpxorq zmm8,zmm8,zmm24 + vpxorq zmm22,zmm22,zmm25 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_873 +$L$_small_initial_partial_block_873: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpxorq zmm4,zmm4,zmm26 + vpxorq zmm0,zmm0,zmm24 + vpxorq zmm3,zmm3,zmm25 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_873: + + or r13,r13 + je NEAR $L$_after_reduction_873 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_873: + jmp NEAR $L$_last_blocks_done_865 +$L$_last_num_blocks_is_5_865: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,64 + kmovq k1,[rax*8+r10] + cmp r15d,251 + jae NEAR $L$_16_blocks_overflow_874 + vpaddd zmm0,zmm2,zmm28 + vpaddd xmm3,xmm0,xmm27 + jmp NEAR $L$_16_blocks_ok_874 + +$L$_16_blocks_overflow_874: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb xmm3,xmm3,xmm29 +$L$_16_blocks_ok_874: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + 
vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm3,0 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 xmm19{k1}{z},[64+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast xmm3,xmm3,xmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq xmm3,xmm3,xmm19 + vextracti32x4 xmm11,zmm3,0 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 XMMWORD[64+r11*1+r10]{k1},xmm3 + vmovdqu8 zmm19{k1}{z},zmm19 + vpshufb zmm17,zmm17,zmm29 + vpshufb xmm19,xmm19,xmm29 + vextracti32x4 xmm7,zmm19,0 + sub r13,16 * (5 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_875 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm19,xmm1,0x01 + vpclmulqdq xmm5,xmm19,xmm1,0x10 + vpclmulqdq xmm0,xmm19,xmm1,0x11 + vpclmulqdq xmm3,xmm19,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + 
vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_875 +$L$_small_initial_partial_block_875: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + + vpxorq zmm30,zmm30,zmm26 + vpxorq zmm8,zmm8,zmm24 + vpxorq zmm22,zmm22,zmm25 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_875: + + or r13,r13 + je NEAR $L$_after_reduction_875 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_875: + jmp NEAR $L$_last_blocks_done_865 +$L$_last_num_blocks_is_6_865: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,64 + kmovq k1,[rax*8+r10] + cmp r15d,250 + jae NEAR $L$_16_blocks_overflow_876 + vpaddd zmm0,zmm2,zmm28 + vpaddd ymm3,ymm0,ymm27 + jmp NEAR $L$_16_blocks_ok_876 + +$L$_16_blocks_overflow_876: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb ymm3,ymm3,ymm29 +$L$_16_blocks_ok_876: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm3,1 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq 
zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 ymm19{k1}{z},[64+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast ymm3,ymm3,ymm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm11,zmm3,1 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 YMMWORD[64+r11*1+r10]{k1},ymm3 + vmovdqu8 zmm19{k1}{z},zmm19 + vpshufb zmm17,zmm17,zmm29 + vpshufb ymm19,ymm19,ymm29 + vextracti32x4 xmm7,zmm19,1 + sub r13,16 * (6 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_877 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm19,ymm1,0x01 + vpclmulqdq ymm5,ymm19,ymm1,0x10 + vpclmulqdq ymm0,ymm19,ymm1,0x11 + vpclmulqdq ymm3,ymm19,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_877 +$L$_small_initial_partial_block_877: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm19,xmm1,0x01 + vpclmulqdq xmm5,xmm19,xmm1,0x10 + vpclmulqdq xmm0,xmm19,xmm1,0x11 + vpclmulqdq xmm3,xmm19,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq 
zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_877: + + or r13,r13 + je NEAR $L$_after_reduction_877 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_877: + jmp NEAR $L$_last_blocks_done_865 +$L$_last_num_blocks_is_7_865: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,64 + kmovq k1,[rax*8+r10] + cmp r15d,249 + jae NEAR $L$_16_blocks_overflow_878 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + jmp NEAR $L$_16_blocks_ok_878 + +$L$_16_blocks_overflow_878: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 +$L$_16_blocks_ok_878: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm3,2 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19{k1}{z},[64+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc 
zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti32x4 xmm11,zmm3,2 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10]{k1},zmm3 + vmovdqu8 zmm19{k1}{z},zmm19 + vpshufb zmm17,zmm17,zmm29 + vpshufb zmm19,zmm19,zmm29 + vextracti32x4 xmm7,zmm19,2 + sub r13,16 * (7 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_879 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm19,zmm1,0x01 + vpclmulqdq zmm5,zmm19,zmm1,0x10 + vpclmulqdq zmm0,zmm19,zmm1,0x11 + vpclmulqdq zmm3,zmm19,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_879 +$L$_small_initial_partial_block_879: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm19,ymm1,0x01 + vpclmulqdq ymm5,ymm19,ymm1,0x10 + vpclmulqdq ymm0,ymm19,ymm1,0x11 + vpclmulqdq ymm3,ymm19,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_879: + + or r13,r13 + je NEAR $L$_after_reduction_879 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_879: + jmp NEAR $L$_last_blocks_done_865 +$L$_last_num_blocks_is_8_865: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,64 + kmovq k1,[rax*8+r10] + cmp r15d,248 + jae NEAR $L$_16_blocks_overflow_880 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + jmp NEAR 
$L$_16_blocks_ok_880 + +$L$_16_blocks_overflow_880: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 +$L$_16_blocks_ok_880: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm3,3 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19{k1}{z},[64+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti32x4 xmm11,zmm3,3 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10]{k1},zmm3 + vmovdqu8 zmm19{k1}{z},zmm19 + vpshufb zmm17,zmm17,zmm29 + vpshufb zmm19,zmm19,zmm29 + vextracti32x4 xmm7,zmm19,3 + sub r13,16 * (8 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_881 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[224+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[288+rdx] 
+ vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + + vpxorq zmm30,zmm30,zmm26 + vpxorq zmm8,zmm8,zmm24 + vpxorq zmm22,zmm22,zmm25 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_881 +$L$_small_initial_partial_block_881: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm19,zmm1,0x01 + vpclmulqdq zmm5,zmm19,zmm1,0x10 + vpclmulqdq zmm0,zmm19,zmm1,0x11 + vpclmulqdq zmm3,zmm19,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_881: + + or r13,r13 + je NEAR $L$_after_reduction_881 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_881: + jmp NEAR $L$_last_blocks_done_865 +$L$_last_num_blocks_is_9_865: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,128 + kmovq k1,[rax*8+r10] + cmp r15d,247 + jae NEAR $L$_16_blocks_overflow_882 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd xmm4,xmm3,xmm27 + jmp NEAR $L$_16_blocks_ok_882 + +$L$_16_blocks_overflow_882: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb xmm4,xmm4,xmm29 +$L$_16_blocks_ok_882: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm4,0 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 
zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 xmm20{k1}{z},[128+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast xmm4,xmm4,xmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq xmm4,xmm4,xmm20 + vextracti32x4 xmm11,zmm4,0 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 XMMWORD[128+r11*1+r10]{k1},xmm4 + vmovdqu8 zmm20{k1}{z},zmm20 + vpshufb zmm17,zmm17,zmm29 + vpshufb zmm19,zmm19,zmm29 + vpshufb xmm20,xmm20,xmm29 + vextracti32x4 xmm7,zmm20,0 + sub r13,16 * (9 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_883 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[208+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm20,xmm1,0x01 + vpclmulqdq 
xmm5,xmm20,xmm1,0x10 + vpclmulqdq xmm0,xmm20,xmm1,0x11 + vpclmulqdq xmm3,xmm20,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_883 +$L$_small_initial_partial_block_883: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[224+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + + vpxorq zmm30,zmm30,zmm26 + vpxorq zmm8,zmm8,zmm24 + vpxorq zmm22,zmm22,zmm25 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_883: + + or r13,r13 + je NEAR $L$_after_reduction_883 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_883: + jmp NEAR $L$_last_blocks_done_865 +$L$_last_num_blocks_is_10_865: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,128 + kmovq k1,[rax*8+r10] + cmp r15d,246 + jae NEAR $L$_16_blocks_overflow_884 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd ymm4,ymm3,ymm27 + jmp NEAR $L$_16_blocks_ok_884 + +$L$_16_blocks_overflow_884: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb ymm4,ymm4,ymm29 +$L$_16_blocks_ok_884: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm4,1 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 
+ vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 ymm20{k1}{z},[128+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast ymm4,ymm4,ymm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq ymm4,ymm4,ymm20 + vextracti32x4 xmm11,zmm4,1 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 YMMWORD[128+r11*1+r10]{k1},ymm4 + vmovdqu8 zmm20{k1}{z},zmm20 + vpshufb zmm17,zmm17,zmm29 + vpshufb zmm19,zmm19,zmm29 + vpshufb ymm20,ymm20,ymm29 + vextracti32x4 xmm7,zmm20,1 + sub r13,16 * (10 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_885 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[192+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm20,ymm1,0x01 + vpclmulqdq ymm5,ymm20,ymm1,0x10 + vpclmulqdq ymm0,ymm20,ymm1,0x11 + vpclmulqdq ymm3,ymm20,ymm1,0x00 + + vpxorq 
zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_885 +$L$_small_initial_partial_block_885: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[208+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm20,xmm1,0x01 + vpclmulqdq xmm5,xmm20,xmm1,0x10 + vpclmulqdq xmm0,xmm20,xmm1,0x11 + vpclmulqdq xmm3,xmm20,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_885: + + or r13,r13 + je NEAR $L$_after_reduction_885 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_885: + jmp NEAR $L$_last_blocks_done_865 +$L$_last_num_blocks_is_11_865: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,128 + kmovq k1,[rax*8+r10] + cmp r15d,245 + jae NEAR $L$_16_blocks_overflow_886 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + jmp NEAR $L$_16_blocks_ok_886 + +$L$_16_blocks_overflow_886: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 +$L$_16_blocks_ok_886: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm4,2 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 
zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20{k1}{z},[128+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vextracti32x4 xmm11,zmm4,2 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10]{k1},zmm4 + vmovdqu8 zmm20{k1}{z},zmm20 + vpshufb zmm17,zmm17,zmm29 + vpshufb zmm19,zmm19,zmm29 + vpshufb zmm20,zmm20,zmm29 + vextracti32x4 xmm7,zmm20,2 + sub r13,16 * (11 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_887 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[176+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 
zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm20,zmm1,0x01 + vpclmulqdq zmm5,zmm20,zmm1,0x10 + vpclmulqdq zmm0,zmm20,zmm1,0x11 + vpclmulqdq zmm3,zmm20,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_887 +$L$_small_initial_partial_block_887: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[192+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm20,ymm1,0x01 + vpclmulqdq ymm5,ymm20,ymm1,0x10 + vpclmulqdq ymm0,ymm20,ymm1,0x11 + vpclmulqdq ymm3,ymm20,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_887: + + or r13,r13 + je NEAR $L$_after_reduction_887 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_887: + jmp NEAR $L$_last_blocks_done_865 +$L$_last_num_blocks_is_12_865: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,128 + kmovq k1,[rax*8+r10] + cmp r15d,244 + jae NEAR $L$_16_blocks_overflow_888 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + jmp NEAR $L$_16_blocks_ok_888 + +$L$_16_blocks_overflow_888: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 +$L$_16_blocks_ok_888: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm4,3 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vbroadcastf64x2 
zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20{k1}{z},[128+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vextracti32x4 xmm11,zmm4,3 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10]{k1},zmm4 + vmovdqu8 zmm20{k1}{z},zmm20 + vpshufb zmm17,zmm17,zmm29 + vpshufb zmm19,zmm19,zmm29 + vpshufb zmm20,zmm20,zmm29 + vextracti32x4 xmm7,zmm20,3 + sub r13,16 * (12 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_889 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[160+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[224+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq 
zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + + vpxorq zmm30,zmm30,zmm26 + vpxorq zmm8,zmm8,zmm24 + vpxorq zmm22,zmm22,zmm25 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_889 +$L$_small_initial_partial_block_889: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[176+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm20,zmm1,0x01 + vpclmulqdq zmm5,zmm20,zmm1,0x10 + vpclmulqdq zmm0,zmm20,zmm1,0x11 + vpclmulqdq zmm3,zmm20,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_889: + + or r13,r13 + je NEAR $L$_after_reduction_889 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_889: + jmp NEAR $L$_last_blocks_done_865 +$L$_last_num_blocks_is_13_865: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,192 + kmovq k1,[rax*8+r10] + cmp r15d,243 + jae NEAR $L$_16_blocks_overflow_890 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + vpaddd xmm5,xmm4,xmm27 + jmp NEAR $L$_16_blocks_ok_890 + +$L$_16_blocks_overflow_890: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpaddd zmm5,zmm4,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb xmm5,xmm5,xmm29 +$L$_16_blocks_ok_890: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm5,0 + vshufi64x2 
zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vpxorq xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20,ZMMWORD[128+r11*1+r9] + vmovdqu8 xmm21{k1}{z},[192+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vaesenclast xmm5,xmm5,xmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vpxorq xmm5,xmm5,xmm21 + vextracti32x4 xmm11,zmm5,0 + mov r10,QWORD[120+rbp] + vmovdqu8 
ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm4 + vmovdqu8 XMMWORD[192+r11*1+r10]{k1},xmm5 + vmovdqu8 zmm21{k1}{z},zmm21 + vpshufb zmm17,zmm17,zmm29 + vpshufb zmm19,zmm19,zmm29 + vpshufb zmm20,zmm20,zmm29 + vpshufb xmm21,xmm21,xmm29 + vextracti32x4 xmm7,zmm21,0 + sub r13,16 * (13 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_891 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[144+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[208+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm21,xmm1,0x01 + vpclmulqdq xmm5,xmm21,xmm1,0x10 + vpclmulqdq xmm0,xmm21,xmm1,0x11 + vpclmulqdq xmm3,xmm21,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_891 +$L$_small_initial_partial_block_891: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[160+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[224+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + + vpxorq zmm30,zmm30,zmm26 + vpxorq zmm8,zmm8,zmm24 + vpxorq zmm22,zmm22,zmm25 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_891: + + or 
r13,r13 + je NEAR $L$_after_reduction_891 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_891: + jmp NEAR $L$_last_blocks_done_865 +$L$_last_num_blocks_is_14_865: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,192 + kmovq k1,[rax*8+r10] + cmp r15d,242 + jae NEAR $L$_16_blocks_overflow_892 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + vpaddd ymm5,ymm4,ymm27 + jmp NEAR $L$_16_blocks_ok_892 + +$L$_16_blocks_overflow_892: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpaddd zmm5,zmm4,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb ymm5,ymm5,ymm29 +$L$_16_blocks_ok_892: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm5,1 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vpxorq ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20,ZMMWORD[128+r11*1+r9] + vmovdqu8 ymm21{k1}{z},[192+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + 
vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vaesenclast ymm5,ymm5,ymm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vpxorq ymm5,ymm5,ymm21 + vextracti32x4 xmm11,zmm5,1 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm4 + vmovdqu8 YMMWORD[192+r11*1+r10]{k1},ymm5 + vmovdqu8 zmm21{k1}{z},zmm21 + vpshufb zmm17,zmm17,zmm29 + vpshufb zmm19,zmm19,zmm29 + vpshufb zmm20,zmm20,zmm29 + vpshufb ymm21,ymm21,ymm29 + vextracti32x4 xmm7,zmm21,1 + sub r13,16 * (14 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_893 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[128+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[192+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm21,ymm1,0x01 + vpclmulqdq ymm5,ymm21,ymm1,0x10 + vpclmulqdq ymm0,ymm21,ymm1,0x11 + vpclmulqdq ymm3,ymm21,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_893 +$L$_small_initial_partial_block_893: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[144+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[208+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq 
zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm21,xmm1,0x01 + vpclmulqdq xmm5,xmm21,xmm1,0x10 + vpclmulqdq xmm0,xmm21,xmm1,0x11 + vpclmulqdq xmm3,xmm21,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_893: + + or r13,r13 + je NEAR $L$_after_reduction_893 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_893: + jmp NEAR $L$_last_blocks_done_865 +$L$_last_num_blocks_is_15_865: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,192 + kmovq k1,[rax*8+r10] + cmp r15d,241 + jae NEAR $L$_16_blocks_overflow_894 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + vpaddd zmm5,zmm4,zmm27 + jmp NEAR $L$_16_blocks_ok_894 + +$L$_16_blocks_overflow_894: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpaddd zmm5,zmm4,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb zmm5,zmm5,zmm29 +$L$_16_blocks_ok_894: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm5,2 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc 
zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20,ZMMWORD[128+r11*1+r9] + vmovdqu8 zmm21{k1}{z},[192+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vaesenclast zmm5,zmm5,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vpxorq zmm5,zmm5,zmm21 + vextracti32x4 xmm11,zmm5,2 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm4 + vmovdqu8 ZMMWORD[192+r11*1+r10]{k1},zmm5 + vmovdqu8 zmm21{k1}{z},zmm21 + vpshufb zmm17,zmm17,zmm29 + vpshufb zmm19,zmm19,zmm29 + vpshufb zmm20,zmm20,zmm29 + vpshufb zmm21,zmm21,zmm29 + vextracti32x4 xmm7,zmm21,2 + sub r13,16 * (15 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_895 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[112+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[176+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm21,zmm1,0x01 + vpclmulqdq zmm5,zmm21,zmm1,0x10 + vpclmulqdq zmm0,zmm21,zmm1,0x11 + vpclmulqdq zmm3,zmm21,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq 
zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_895 +$L$_small_initial_partial_block_895: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[128+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[192+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm21,ymm1,0x01 + vpclmulqdq ymm5,ymm21,ymm1,0x10 + vpclmulqdq ymm0,ymm21,ymm1,0x11 + vpclmulqdq ymm3,ymm21,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_895: + + or r13,r13 + je NEAR $L$_after_reduction_895 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_895: + jmp NEAR $L$_last_blocks_done_865 +$L$_last_num_blocks_is_16_865: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,192 + kmovq k1,[rax*8+r10] + cmp r15d,240 + jae NEAR $L$_16_blocks_overflow_896 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + vpaddd zmm5,zmm4,zmm27 + jmp NEAR $L$_16_blocks_ok_896 + +$L$_16_blocks_overflow_896: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpaddd zmm5,zmm4,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb zmm5,zmm5,zmm29 +$L$_16_blocks_ok_896: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm5,3 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq 
zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20,ZMMWORD[128+r11*1+r9] + vmovdqu8 zmm21{k1}{z},[192+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vaesenclast zmm5,zmm5,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vpxorq zmm5,zmm5,zmm21 + vextracti32x4 xmm11,zmm5,3 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm4 + vmovdqu8 ZMMWORD[192+r11*1+r10]{k1},zmm5 + vmovdqu8 zmm21{k1}{z},zmm21 + vpshufb zmm17,zmm17,zmm29 + vpshufb zmm19,zmm19,zmm29 + vpshufb zmm20,zmm20,zmm29 + vpshufb zmm21,zmm21,zmm29 + vextracti32x4 
xmm7,zmm21,3 + sub r13,16 * (16 - 1) +$L$_small_initial_partial_block_897: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[112+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[176+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm21,zmm1,0x01 + vpclmulqdq zmm5,zmm21,zmm1,0x10 + vpclmulqdq zmm0,zmm21,zmm1,0x11 + vpclmulqdq zmm3,zmm21,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_897: + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_897: + jmp NEAR $L$_last_blocks_done_865 +$L$_last_num_blocks_is_0_865: + vmovdqa64 zmm13,ZMMWORD[768+rsp] + vpxorq zmm13,zmm13,zmm14 + vmovdqu64 zmm12,ZMMWORD[rbx*1+rsp] + vpclmulqdq zmm0,zmm13,zmm12,0x11 + vpclmulqdq zmm3,zmm13,zmm12,0x00 + vpclmulqdq zmm4,zmm13,zmm12,0x01 + vpclmulqdq zmm5,zmm13,zmm12,0x10 + vmovdqa64 zmm13,ZMMWORD[832+rsp] + vmovdqu64 zmm12,ZMMWORD[64+rbx*1+rsp] + vpclmulqdq zmm6,zmm13,zmm12,0x11 + vpclmulqdq zmm7,zmm13,zmm12,0x00 + vpclmulqdq zmm10,zmm13,zmm12,0x01 + vpclmulqdq zmm11,zmm13,zmm12,0x10 + vpxorq zmm26,zmm4,zmm10 + vpxorq zmm24,zmm0,zmm6 + vpxorq zmm25,zmm3,zmm7 + vpternlogq zmm26,zmm5,zmm11,0x96 + vmovdqa64 zmm13,ZMMWORD[896+rsp] + vmovdqu64 zmm12,ZMMWORD[128+rbx*1+rsp] + vpclmulqdq zmm0,zmm13,zmm12,0x11 + vpclmulqdq zmm3,zmm13,zmm12,0x00 + vpclmulqdq zmm4,zmm13,zmm12,0x01 + vpclmulqdq zmm5,zmm13,zmm12,0x10 + vmovdqa64 zmm13,ZMMWORD[960+rsp] + vmovdqu64 zmm12,ZMMWORD[192+rbx*1+rsp] + vpclmulqdq zmm6,zmm13,zmm12,0x11 + vpclmulqdq zmm7,zmm13,zmm12,0x00 + vpclmulqdq zmm10,zmm13,zmm12,0x01 + vpclmulqdq zmm11,zmm13,zmm12,0x10 + + vpternlogq zmm26,zmm4,zmm10,0x96 + vpternlogq zmm24,zmm0,zmm6,0x96 + vpternlogq zmm25,zmm3,zmm7,0x96 + vpternlogq zmm26,zmm5,zmm11,0x96 + + vpsrldq zmm0,zmm26,8 + vpslldq zmm3,zmm26,8 + vpxorq zmm24,zmm24,zmm0 + vpxorq zmm25,zmm25,zmm3 + vextracti64x4 ymm0,zmm24,1 + vpxorq ymm24,ymm24,ymm0 + vextracti32x4 xmm0,ymm24,1 + vpxorq xmm24,xmm24,xmm0 + vextracti64x4 ymm3,zmm25,1 + vpxorq ymm25,ymm25,ymm3 + vextracti32x4 xmm3,ymm25,1 + vpxorq xmm25,xmm25,xmm3 + vmovdqa64 xmm4,XMMWORD[POLY2] + + + vpclmulqdq xmm0,xmm4,xmm25,0x01 + vpslldq xmm0,xmm0,8 + vpxorq xmm0,xmm25,xmm0 + + + vpclmulqdq xmm3,xmm4,xmm0,0x00 + vpsrldq xmm3,xmm3,4 + 
vpclmulqdq xmm14,xmm4,xmm0,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm3,xmm24,0x96 + +$L$_last_blocks_done_865: + vpshufb xmm2,xmm2,xmm29 + jmp NEAR $L$_ghash_done_821 +$L$_encrypt_16_blocks_821: + cmp r15b,240 + jae NEAR $L$_16_blocks_overflow_898 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + vpaddd zmm5,zmm4,zmm27 + jmp NEAR $L$_16_blocks_ok_898 +$L$_16_blocks_overflow_898: + vpshufb zmm2,zmm2,zmm29 + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpaddd zmm5,zmm4,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb zmm5,zmm5,zmm29 +$L$_16_blocks_ok_898: + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rsp] + + + + + vshufi64x2 zmm2,zmm5,zmm5,255 + add r15b,16 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + + + + + + + + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + + vpclmulqdq zmm6,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + + + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + + + + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + + vpternlogq zmm6,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + + + + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + + + + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20,ZMMWORD[128+r11*1+r9] + vmovdqu8 zmm21,ZMMWORD[192+r11*1+r9] + + + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + + + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm26,zmm10,zmm15 + vpxorq zmm24,zmm6,zmm12 + vpxorq zmm25,zmm7,zmm13 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 
+ vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vaesenclast zmm5,zmm5,zmm30 + + + + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vpxorq zmm5,zmm5,zmm21 + + + + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm4 + vmovdqu8 ZMMWORD[192+r11*1+r10],zmm5 + vpshufb zmm0,zmm17,zmm29 + vpshufb zmm3,zmm19,zmm29 + vpshufb zmm4,zmm20,zmm29 + vpshufb zmm5,zmm21,zmm29 + vmovdqa64 ZMMWORD[1280+rsp],zmm0 + vmovdqa64 ZMMWORD[1344+rsp],zmm3 + vmovdqa64 ZMMWORD[1408+rsp],zmm4 + vmovdqa64 ZMMWORD[1472+rsp],zmm5 + vmovdqa64 zmm13,ZMMWORD[1024+rsp] + vmovdqu64 zmm12,ZMMWORD[256+rsp] + vpclmulqdq zmm0,zmm13,zmm12,0x11 + vpclmulqdq zmm3,zmm13,zmm12,0x00 + vpclmulqdq zmm4,zmm13,zmm12,0x01 + vpclmulqdq zmm5,zmm13,zmm12,0x10 + vmovdqa64 zmm13,ZMMWORD[1088+rsp] + vmovdqu64 zmm12,ZMMWORD[320+rsp] + vpclmulqdq zmm6,zmm13,zmm12,0x11 + vpclmulqdq zmm7,zmm13,zmm12,0x00 + vpclmulqdq zmm10,zmm13,zmm12,0x01 + vpclmulqdq zmm11,zmm13,zmm12,0x10 + vpternlogq zmm26,zmm4,zmm10,0x96 + vpternlogq zmm24,zmm0,zmm6,0x96 + vpternlogq zmm25,zmm3,zmm7,0x96 + vpternlogq zmm26,zmm5,zmm11,0x96 + vmovdqa64 zmm13,ZMMWORD[1152+rsp] + vmovdqu64 zmm12,ZMMWORD[384+rsp] + vpclmulqdq zmm0,zmm13,zmm12,0x11 + vpclmulqdq zmm3,zmm13,zmm12,0x00 + vpclmulqdq zmm4,zmm13,zmm12,0x01 + vpclmulqdq zmm5,zmm13,zmm12,0x10 + vmovdqa64 zmm13,ZMMWORD[1216+rsp] + vmovdqu64 zmm12,ZMMWORD[448+rsp] + vpclmulqdq zmm6,zmm13,zmm12,0x11 + vpclmulqdq zmm7,zmm13,zmm12,0x00 + vpclmulqdq zmm10,zmm13,zmm12,0x01 + vpclmulqdq zmm11,zmm13,zmm12,0x10 + + vpternlogq zmm26,zmm4,zmm10,0x96 + vpternlogq zmm24,zmm0,zmm6,0x96 + vpternlogq zmm25,zmm3,zmm7,0x96 + vpternlogq zmm26,zmm5,zmm11,0x96 + sub r13,256 + add r11,256 + mov r10d,r13d + add r10d,15 + shr r10d,4 + je NEAR $L$_last_num_blocks_is_0_899 + + cmp r10d,8 + je NEAR $L$_last_num_blocks_is_8_899 + jb NEAR $L$_last_num_blocks_is_7_1_899 + + + cmp r10d,12 + je NEAR $L$_last_num_blocks_is_12_899 + jb NEAR $L$_last_num_blocks_is_11_9_899 + + + cmp r10d,15 + je NEAR $L$_last_num_blocks_is_15_899 + ja NEAR $L$_last_num_blocks_is_16_899 + cmp r10d,14 + je NEAR $L$_last_num_blocks_is_14_899 + jmp NEAR $L$_last_num_blocks_is_13_899 + +$L$_last_num_blocks_is_11_9_899: + + cmp r10d,10 + je NEAR $L$_last_num_blocks_is_10_899 + ja NEAR $L$_last_num_blocks_is_11_899 + jmp NEAR $L$_last_num_blocks_is_9_899 + +$L$_last_num_blocks_is_7_1_899: + cmp r10d,4 + je NEAR $L$_last_num_blocks_is_4_899 + jb NEAR $L$_last_num_blocks_is_3_1_899 + + cmp r10d,6 + ja NEAR $L$_last_num_blocks_is_7_899 + je NEAR $L$_last_num_blocks_is_6_899 + jmp NEAR $L$_last_num_blocks_is_5_899 + +$L$_last_num_blocks_is_3_1_899: + + cmp r10d,2 + ja NEAR $L$_last_num_blocks_is_3_899 + je NEAR $L$_last_num_blocks_is_2_899 +$L$_last_num_blocks_is_1_899: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + kmovq 
k1,[rax*8+r10] + cmp r15d,255 + jae NEAR $L$_16_blocks_overflow_900 + vpaddd xmm0,xmm2,xmm28 + jmp NEAR $L$_16_blocks_ok_900 + +$L$_16_blocks_overflow_900: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpshufb xmm0,xmm0,xmm29 +$L$_16_blocks_ok_900: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1280+rsp] + vmovdqu64 zmm1,ZMMWORD[512+rsp] + vextracti32x4 xmm2,zmm0,0 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[576+rsp] + vmovdqa64 zmm22,ZMMWORD[1344+rsp] + vpxorq xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[640+rsp] + vmovdqa64 zmm8,ZMMWORD[1408+rsp] + vaesenc xmm0,xmm0,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[704+rsp] + vmovdqa64 zmm22,ZMMWORD[1472+rsp] + vaesenc xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc xmm0,xmm0,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 xmm17{k1}{z},[r11*1+r9] + vaesenc xmm0,xmm0,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm14,zmm24,zmm12,0x96 + vpternlogq zmm7,zmm25,zmm13,0x96 + vpternlogq zmm10,zmm26,zmm15,0x96 + vaesenc xmm0,xmm0,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vpsrldq zmm15,zmm10,8 + vpslldq zmm10,zmm10,8 + + vmovdqa64 xmm16,XMMWORD[POLY2] + vaesenc xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vpxorq zmm14,zmm14,zmm15 + vpxorq zmm7,zmm7,zmm10 + vaesenc xmm0,xmm0,xmm31 + vextracti64x4 ymm12,zmm14,1 + vpxorq ymm14,ymm14,ymm12 + vextracti32x4 xmm12,ymm14,1 + vpxorq xmm14,xmm14,xmm12 + vextracti64x4 ymm13,zmm7,1 + vpxorq ymm7,ymm7,ymm13 + vextracti32x4 xmm13,ymm7,1 + vpxorq xmm7,xmm7,xmm13 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vpclmulqdq xmm13,xmm16,xmm7,0x01 + vpslldq xmm13,xmm13,8 + vpxorq xmm13,xmm7,xmm13 + vaesenc xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc xmm0,xmm0,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + vaesenc xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + vaesenc xmm0,xmm0,xmm31 + vpclmulqdq xmm12,xmm16,xmm13,0x00 + vpsrldq xmm12,xmm12,4 + vpclmulqdq xmm15,xmm16,xmm13,0x10 + vpslldq xmm15,xmm15,4 + + vpternlogq xmm14,xmm15,xmm12,0x96 + vaesenclast xmm0,xmm0,xmm30 + vpxorq xmm0,xmm0,xmm17 + vextracti32x4 xmm11,zmm0,0 + mov r10,QWORD[120+rbp] + vmovdqu8 XMMWORD[r11*1+r10]{k1},xmm0 + vmovdqu8 zmm17{k1}{z},zmm17 + vpshufb xmm17,xmm17,xmm29 + vextracti32x4 xmm7,zmm17,0 + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_901 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm17,xmm1,0x01 + vpclmulqdq 
xmm5,xmm17,xmm1,0x10 + vpclmulqdq xmm0,xmm17,xmm1,0x11 + vpclmulqdq xmm3,xmm17,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_901 +$L$_small_initial_partial_block_901: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + + + + + + + + + + + + vpxorq xmm14,xmm14,xmm7 + + jmp NEAR $L$_after_reduction_901 +$L$_small_initial_compute_done_901: +$L$_after_reduction_901: + jmp NEAR $L$_last_blocks_done_899 +$L$_last_num_blocks_is_2_899: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + kmovq k1,[rax*8+r10] + cmp r15d,254 + jae NEAR $L$_16_blocks_overflow_902 + vpaddd ymm0,ymm2,ymm28 + jmp NEAR $L$_16_blocks_ok_902 + +$L$_16_blocks_overflow_902: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpshufb ymm0,ymm0,ymm29 +$L$_16_blocks_ok_902: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1280+rsp] + vmovdqu64 zmm1,ZMMWORD[512+rsp] + vextracti32x4 xmm2,zmm0,1 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[576+rsp] + vmovdqa64 zmm22,ZMMWORD[1344+rsp] + vpxorq ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[640+rsp] + vmovdqa64 zmm8,ZMMWORD[1408+rsp] + vaesenc ymm0,ymm0,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[704+rsp] + vmovdqa64 zmm22,ZMMWORD[1472+rsp] + vaesenc ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc ymm0,ymm0,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 ymm17{k1}{z},[r11*1+r9] + vaesenc ymm0,ymm0,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm14,zmm24,zmm12,0x96 + vpternlogq zmm7,zmm25,zmm13,0x96 + vpternlogq zmm10,zmm26,zmm15,0x96 + vaesenc ymm0,ymm0,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vpsrldq zmm15,zmm10,8 + vpslldq zmm10,zmm10,8 + + vmovdqa64 xmm16,XMMWORD[POLY2] + vaesenc ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vpxorq zmm14,zmm14,zmm15 + vpxorq zmm7,zmm7,zmm10 + vaesenc ymm0,ymm0,ymm31 + vextracti64x4 ymm12,zmm14,1 + vpxorq ymm14,ymm14,ymm12 + vextracti32x4 
xmm12,ymm14,1 + vpxorq xmm14,xmm14,xmm12 + vextracti64x4 ymm13,zmm7,1 + vpxorq ymm7,ymm7,ymm13 + vextracti32x4 xmm13,ymm7,1 + vpxorq xmm7,xmm7,xmm13 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vpclmulqdq xmm13,xmm16,xmm7,0x01 + vpslldq xmm13,xmm13,8 + vpxorq xmm13,xmm7,xmm13 + vaesenc ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc ymm0,ymm0,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + vaesenc ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + vaesenc ymm0,ymm0,ymm31 + vpclmulqdq xmm12,xmm16,xmm13,0x00 + vpsrldq xmm12,xmm12,4 + vpclmulqdq xmm15,xmm16,xmm13,0x10 + vpslldq xmm15,xmm15,4 + + vpternlogq xmm14,xmm15,xmm12,0x96 + vaesenclast ymm0,ymm0,ymm30 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm11,zmm0,1 + mov r10,QWORD[120+rbp] + vmovdqu8 YMMWORD[r11*1+r10]{k1},ymm0 + vmovdqu8 zmm17{k1}{z},zmm17 + vpshufb ymm17,ymm17,ymm29 + vextracti32x4 xmm7,zmm17,1 + sub r13,16 * (2 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_903 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm17,ymm1,0x01 + vpclmulqdq ymm5,ymm17,ymm1,0x10 + vpclmulqdq ymm0,ymm17,ymm1,0x11 + vpclmulqdq ymm3,ymm17,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_903 +$L$_small_initial_partial_block_903: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm17,xmm1,0x01 + vpclmulqdq xmm5,xmm17,xmm1,0x10 + vpclmulqdq xmm0,xmm17,xmm1,0x11 + vpclmulqdq xmm3,xmm17,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_903: + + or r13,r13 + je NEAR $L$_after_reduction_903 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_903: + jmp NEAR $L$_last_blocks_done_899 +$L$_last_num_blocks_is_3_899: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + kmovq k1,[rax*8+r10] + cmp r15d,253 + jae NEAR $L$_16_blocks_overflow_904 + vpaddd zmm0,zmm2,zmm28 + jmp NEAR $L$_16_blocks_ok_904 + +$L$_16_blocks_overflow_904: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpshufb zmm0,zmm0,zmm29 +$L$_16_blocks_ok_904: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1280+rsp] + vmovdqu64 zmm1,ZMMWORD[512+rsp] + vextracti32x4 xmm2,zmm0,2 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 
zmm18,ZMMWORD[576+rsp] + vmovdqa64 zmm22,ZMMWORD[1344+rsp] + vpxorq zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[640+rsp] + vmovdqa64 zmm8,ZMMWORD[1408+rsp] + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[704+rsp] + vmovdqa64 zmm22,ZMMWORD[1472+rsp] + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17{k1}{z},[r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm14,zmm24,zmm12,0x96 + vpternlogq zmm7,zmm25,zmm13,0x96 + vpternlogq zmm10,zmm26,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vpsrldq zmm15,zmm10,8 + vpslldq zmm10,zmm10,8 + + vmovdqa64 xmm16,XMMWORD[POLY2] + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vpxorq zmm14,zmm14,zmm15 + vpxorq zmm7,zmm7,zmm10 + vaesenc zmm0,zmm0,zmm31 + vextracti64x4 ymm12,zmm14,1 + vpxorq ymm14,ymm14,ymm12 + vextracti32x4 xmm12,ymm14,1 + vpxorq xmm14,xmm14,xmm12 + vextracti64x4 ymm13,zmm7,1 + vpxorq ymm7,ymm7,ymm13 + vextracti32x4 xmm13,ymm7,1 + vpxorq xmm7,xmm7,xmm13 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vpclmulqdq xmm13,xmm16,xmm7,0x01 + vpslldq xmm13,xmm13,8 + vpxorq xmm13,xmm7,xmm13 + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + vaesenc zmm0,zmm0,zmm31 + vpclmulqdq xmm12,xmm16,xmm13,0x00 + vpsrldq xmm12,xmm12,4 + vpclmulqdq xmm15,xmm16,xmm13,0x10 + vpslldq xmm15,xmm15,4 + + vpternlogq xmm14,xmm15,xmm12,0x96 + vaesenclast zmm0,zmm0,zmm30 + vpxorq zmm0,zmm0,zmm17 + vextracti32x4 xmm11,zmm0,2 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10]{k1},zmm0 + vmovdqu8 zmm17{k1}{z},zmm17 + vpshufb zmm17,zmm17,zmm29 + vextracti32x4 xmm7,zmm17,2 + sub r13,16 * (3 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_905 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 
xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_905 +$L$_small_initial_partial_block_905: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm17,ymm1,0x01 + vpclmulqdq ymm5,ymm17,ymm1,0x10 + vpclmulqdq ymm0,ymm17,ymm1,0x11 + vpclmulqdq ymm3,ymm17,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_905: + + or r13,r13 + je NEAR $L$_after_reduction_905 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_905: + jmp NEAR $L$_last_blocks_done_899 +$L$_last_num_blocks_is_4_899: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + kmovq k1,[rax*8+r10] + cmp r15d,252 + jae NEAR $L$_16_blocks_overflow_906 + vpaddd zmm0,zmm2,zmm28 + jmp NEAR $L$_16_blocks_ok_906 + +$L$_16_blocks_overflow_906: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpshufb zmm0,zmm0,zmm29 +$L$_16_blocks_ok_906: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1280+rsp] + vmovdqu64 zmm1,ZMMWORD[512+rsp] + vextracti32x4 xmm2,zmm0,3 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[576+rsp] + vmovdqa64 zmm22,ZMMWORD[1344+rsp] + vpxorq zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[640+rsp] + vmovdqa64 zmm8,ZMMWORD[1408+rsp] + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[704+rsp] + vmovdqa64 zmm22,ZMMWORD[1472+rsp] + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17{k1}{z},[r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm14,zmm24,zmm12,0x96 + vpternlogq zmm7,zmm25,zmm13,0x96 + vpternlogq zmm10,zmm26,zmm15,0x96 + vaesenc 
zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vpsrldq zmm15,zmm10,8 + vpslldq zmm10,zmm10,8 + + vmovdqa64 xmm16,XMMWORD[POLY2] + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vpxorq zmm14,zmm14,zmm15 + vpxorq zmm7,zmm7,zmm10 + vaesenc zmm0,zmm0,zmm31 + vextracti64x4 ymm12,zmm14,1 + vpxorq ymm14,ymm14,ymm12 + vextracti32x4 xmm12,ymm14,1 + vpxorq xmm14,xmm14,xmm12 + vextracti64x4 ymm13,zmm7,1 + vpxorq ymm7,ymm7,ymm13 + vextracti32x4 xmm13,ymm7,1 + vpxorq xmm7,xmm7,xmm13 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vpclmulqdq xmm13,xmm16,xmm7,0x01 + vpslldq xmm13,xmm13,8 + vpxorq xmm13,xmm7,xmm13 + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + vaesenc zmm0,zmm0,zmm31 + vpclmulqdq xmm12,xmm16,xmm13,0x00 + vpsrldq xmm12,xmm12,4 + vpclmulqdq xmm15,xmm16,xmm13,0x10 + vpslldq xmm15,xmm15,4 + + vpternlogq xmm14,xmm15,xmm12,0x96 + vaesenclast zmm0,zmm0,zmm30 + vpxorq zmm0,zmm0,zmm17 + vextracti32x4 xmm11,zmm0,3 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10]{k1},zmm0 + vmovdqu8 zmm17{k1}{z},zmm17 + vpshufb zmm17,zmm17,zmm29 + vextracti32x4 xmm7,zmm17,3 + sub r13,16 * (4 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_907 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_907 +$L$_small_initial_partial_block_907: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_907: + + or r13,r13 + je NEAR $L$_after_reduction_907 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_907: + jmp NEAR $L$_last_blocks_done_899 +$L$_last_num_blocks_is_5_899: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,64 + kmovq k1,[rax*8+r10] + cmp r15d,251 + jae NEAR $L$_16_blocks_overflow_908 + vpaddd 
zmm0,zmm2,zmm28 + vpaddd xmm3,xmm0,xmm27 + jmp NEAR $L$_16_blocks_ok_908 + +$L$_16_blocks_overflow_908: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb xmm3,xmm3,xmm29 +$L$_16_blocks_ok_908: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1280+rsp] + vmovdqu64 zmm1,ZMMWORD[512+rsp] + vextracti32x4 xmm2,zmm3,0 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[576+rsp] + vmovdqa64 zmm22,ZMMWORD[1344+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[640+rsp] + vmovdqa64 zmm8,ZMMWORD[1408+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[704+rsp] + vmovdqa64 zmm22,ZMMWORD[1472+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 xmm19{k1}{z},[64+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm14,zmm24,zmm12,0x96 + vpternlogq zmm7,zmm25,zmm13,0x96 + vpternlogq zmm10,zmm26,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vpsrldq zmm15,zmm10,8 + vpslldq zmm10,zmm10,8 + + vmovdqa64 xmm16,XMMWORD[POLY2] + vaesenc zmm0,zmm0,zmm30 + vaesenc xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vpxorq zmm14,zmm14,zmm15 + vpxorq zmm7,zmm7,zmm10 + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vextracti64x4 ymm12,zmm14,1 + vpxorq ymm14,ymm14,ymm12 + vextracti32x4 xmm12,ymm14,1 + vpxorq xmm14,xmm14,xmm12 + vextracti64x4 ymm13,zmm7,1 + vpxorq ymm7,ymm7,ymm13 + vextracti32x4 xmm13,ymm7,1 + vpxorq xmm7,xmm7,xmm13 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vpclmulqdq xmm13,xmm16,xmm7,0x01 + vpslldq xmm13,xmm13,8 + vpxorq xmm13,xmm7,xmm13 + vaesenc zmm0,zmm0,zmm30 + vaesenc xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vpclmulqdq xmm12,xmm16,xmm13,0x00 + vpsrldq xmm12,xmm12,4 + vpclmulqdq xmm15,xmm16,xmm13,0x10 + vpslldq xmm15,xmm15,4 + + vpternlogq xmm14,xmm15,xmm12,0x96 + vaesenclast 
zmm0,zmm0,zmm30 + vaesenclast xmm3,xmm3,xmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq xmm3,xmm3,xmm19 + vextracti32x4 xmm11,zmm3,0 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 XMMWORD[64+r11*1+r10]{k1},xmm3 + vmovdqu8 zmm19{k1}{z},zmm19 + vpshufb zmm17,zmm17,zmm29 + vpshufb xmm19,xmm19,xmm29 + vextracti32x4 xmm7,zmm19,0 + sub r13,16 * (5 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_909 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm19,xmm1,0x01 + vpclmulqdq xmm5,xmm19,xmm1,0x10 + vpclmulqdq xmm0,xmm19,xmm1,0x11 + vpclmulqdq xmm3,xmm19,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm31 + vpxorq zmm0,zmm0,zmm8 + vpxorq zmm3,zmm3,zmm22 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_909 +$L$_small_initial_partial_block_909: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_909: + + or r13,r13 + je NEAR $L$_after_reduction_909 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_909: + jmp NEAR $L$_last_blocks_done_899 +$L$_last_num_blocks_is_6_899: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,64 + kmovq k1,[rax*8+r10] + cmp r15d,250 + jae NEAR $L$_16_blocks_overflow_910 + vpaddd zmm0,zmm2,zmm28 + vpaddd ymm3,ymm0,ymm27 + jmp NEAR $L$_16_blocks_ok_910 + +$L$_16_blocks_overflow_910: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb ymm3,ymm3,ymm29 +$L$_16_blocks_ok_910: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1280+rsp] + vmovdqu64 zmm1,ZMMWORD[512+rsp] + vextracti32x4 xmm2,zmm3,1 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[576+rsp] + vmovdqa64 zmm22,ZMMWORD[1344+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + 
vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[640+rsp] + vmovdqa64 zmm8,ZMMWORD[1408+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[704+rsp] + vmovdqa64 zmm22,ZMMWORD[1472+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 ymm19{k1}{z},[64+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm14,zmm24,zmm12,0x96 + vpternlogq zmm7,zmm25,zmm13,0x96 + vpternlogq zmm10,zmm26,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vpsrldq zmm15,zmm10,8 + vpslldq zmm10,zmm10,8 + + vmovdqa64 xmm16,XMMWORD[POLY2] + vaesenc zmm0,zmm0,zmm30 + vaesenc ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vpxorq zmm14,zmm14,zmm15 + vpxorq zmm7,zmm7,zmm10 + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vextracti64x4 ymm12,zmm14,1 + vpxorq ymm14,ymm14,ymm12 + vextracti32x4 xmm12,ymm14,1 + vpxorq xmm14,xmm14,xmm12 + vextracti64x4 ymm13,zmm7,1 + vpxorq ymm7,ymm7,ymm13 + vextracti32x4 xmm13,ymm7,1 + vpxorq xmm7,xmm7,xmm13 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vpclmulqdq xmm13,xmm16,xmm7,0x01 + vpslldq xmm13,xmm13,8 + vpxorq xmm13,xmm7,xmm13 + vaesenc zmm0,zmm0,zmm30 + vaesenc ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vpclmulqdq xmm12,xmm16,xmm13,0x00 + vpsrldq xmm12,xmm12,4 + vpclmulqdq xmm15,xmm16,xmm13,0x10 + vpslldq xmm15,xmm15,4 + + vpternlogq xmm14,xmm15,xmm12,0x96 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast ymm3,ymm3,ymm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm11,zmm3,1 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 YMMWORD[64+r11*1+r10]{k1},ymm3 + vmovdqu8 zmm19{k1}{z},zmm19 + vpshufb zmm17,zmm17,zmm29 + vpshufb ymm19,ymm19,ymm29 + vextracti32x4 xmm7,zmm19,1 + sub r13,16 * (6 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_911 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 ymm1,YMMWORD[320+rdx] + 
vpclmulqdq ymm4,ymm19,ymm1,0x01 + vpclmulqdq ymm5,ymm19,ymm1,0x10 + vpclmulqdq ymm0,ymm19,ymm1,0x11 + vpclmulqdq ymm3,ymm19,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm31 + vpxorq zmm0,zmm0,zmm8 + vpxorq zmm3,zmm3,zmm22 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_911 +$L$_small_initial_partial_block_911: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm19,xmm1,0x01 + vpclmulqdq xmm5,xmm19,xmm1,0x10 + vpclmulqdq xmm0,xmm19,xmm1,0x11 + vpclmulqdq xmm3,xmm19,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm31 + vpxorq zmm0,zmm0,zmm8 + vpxorq zmm3,zmm3,zmm22 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_911: + + or r13,r13 + je NEAR $L$_after_reduction_911 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_911: + jmp NEAR $L$_last_blocks_done_899 +$L$_last_num_blocks_is_7_899: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,64 + kmovq k1,[rax*8+r10] + cmp r15d,249 + jae NEAR $L$_16_blocks_overflow_912 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + jmp NEAR $L$_16_blocks_ok_912 + +$L$_16_blocks_overflow_912: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 +$L$_16_blocks_ok_912: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1280+rsp] + vmovdqu64 zmm1,ZMMWORD[512+rsp] + vextracti32x4 xmm2,zmm3,2 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[576+rsp] + vmovdqa64 zmm22,ZMMWORD[1344+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[640+rsp] + vmovdqa64 zmm8,ZMMWORD[1408+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq 
zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[704+rsp] + vmovdqa64 zmm22,ZMMWORD[1472+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19{k1}{z},[64+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm14,zmm24,zmm12,0x96 + vpternlogq zmm7,zmm25,zmm13,0x96 + vpternlogq zmm10,zmm26,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vpsrldq zmm15,zmm10,8 + vpslldq zmm10,zmm10,8 + + vmovdqa64 xmm16,XMMWORD[POLY2] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vpxorq zmm14,zmm14,zmm15 + vpxorq zmm7,zmm7,zmm10 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vextracti64x4 ymm12,zmm14,1 + vpxorq ymm14,ymm14,ymm12 + vextracti32x4 xmm12,ymm14,1 + vpxorq xmm14,xmm14,xmm12 + vextracti64x4 ymm13,zmm7,1 + vpxorq ymm7,ymm7,ymm13 + vextracti32x4 xmm13,ymm7,1 + vpxorq xmm7,xmm7,xmm13 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vpclmulqdq xmm13,xmm16,xmm7,0x01 + vpslldq xmm13,xmm13,8 + vpxorq xmm13,xmm7,xmm13 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vpclmulqdq xmm12,xmm16,xmm13,0x00 + vpsrldq xmm12,xmm12,4 + vpclmulqdq xmm15,xmm16,xmm13,0x10 + vpslldq xmm15,xmm15,4 + + vpternlogq xmm14,xmm15,xmm12,0x96 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti32x4 xmm11,zmm3,2 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10]{k1},zmm3 + vmovdqu8 zmm19{k1}{z},zmm19 + vpshufb zmm17,zmm17,zmm29 + vpshufb zmm19,zmm19,zmm29 + vextracti32x4 xmm7,zmm19,2 + sub r13,16 * (7 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_913 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm19,zmm1,0x01 + vpclmulqdq zmm5,zmm19,zmm1,0x10 + vpclmulqdq zmm0,zmm19,zmm1,0x11 + vpclmulqdq zmm3,zmm19,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm31 + vpxorq zmm0,zmm0,zmm8 + vpxorq zmm3,zmm3,zmm22 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 
ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_913 +$L$_small_initial_partial_block_913: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm19,ymm1,0x01 + vpclmulqdq ymm5,ymm19,ymm1,0x10 + vpclmulqdq ymm0,ymm19,ymm1,0x11 + vpclmulqdq ymm3,ymm19,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm31 + vpxorq zmm0,zmm0,zmm8 + vpxorq zmm3,zmm3,zmm22 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_913: + + or r13,r13 + je NEAR $L$_after_reduction_913 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_913: + jmp NEAR $L$_last_blocks_done_899 +$L$_last_num_blocks_is_8_899: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,64 + kmovq k1,[rax*8+r10] + cmp r15d,248 + jae NEAR $L$_16_blocks_overflow_914 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + jmp NEAR $L$_16_blocks_ok_914 + +$L$_16_blocks_overflow_914: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 +$L$_16_blocks_ok_914: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1280+rsp] + vmovdqu64 zmm1,ZMMWORD[512+rsp] + vextracti32x4 xmm2,zmm3,3 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[576+rsp] + vmovdqa64 zmm22,ZMMWORD[1344+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[640+rsp] + vmovdqa64 zmm8,ZMMWORD[1408+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[704+rsp] + vmovdqa64 zmm22,ZMMWORD[1472+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + 
vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19{k1}{z},[64+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm14,zmm24,zmm12,0x96 + vpternlogq zmm7,zmm25,zmm13,0x96 + vpternlogq zmm10,zmm26,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vpsrldq zmm15,zmm10,8 + vpslldq zmm10,zmm10,8 + + vmovdqa64 xmm16,XMMWORD[POLY2] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vpxorq zmm14,zmm14,zmm15 + vpxorq zmm7,zmm7,zmm10 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vextracti64x4 ymm12,zmm14,1 + vpxorq ymm14,ymm14,ymm12 + vextracti32x4 xmm12,ymm14,1 + vpxorq xmm14,xmm14,xmm12 + vextracti64x4 ymm13,zmm7,1 + vpxorq ymm7,ymm7,ymm13 + vextracti32x4 xmm13,ymm7,1 + vpxorq xmm7,xmm7,xmm13 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vpclmulqdq xmm13,xmm16,xmm7,0x01 + vpslldq xmm13,xmm13,8 + vpxorq xmm13,xmm7,xmm13 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vpclmulqdq xmm12,xmm16,xmm13,0x00 + vpsrldq xmm12,xmm12,4 + vpclmulqdq xmm15,xmm16,xmm13,0x10 + vpslldq xmm15,xmm15,4 + + vpternlogq xmm14,xmm15,xmm12,0x96 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti32x4 xmm11,zmm3,3 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10]{k1},zmm3 + vmovdqu8 zmm19{k1}{z},zmm19 + vpshufb zmm17,zmm17,zmm29 + vpshufb zmm19,zmm19,zmm29 + vextracti32x4 xmm7,zmm19,3 + sub r13,16 * (8 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_915 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[224+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq 
xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_915 +$L$_small_initial_partial_block_915: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm19,zmm1,0x01 + vpclmulqdq zmm5,zmm19,zmm1,0x10 + vpclmulqdq zmm0,zmm19,zmm1,0x11 + vpclmulqdq zmm3,zmm19,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm31 + vpxorq zmm0,zmm0,zmm8 + vpxorq zmm3,zmm3,zmm22 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_915: + + or r13,r13 + je NEAR $L$_after_reduction_915 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_915: + jmp NEAR $L$_last_blocks_done_899 +$L$_last_num_blocks_is_9_899: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,128 + kmovq k1,[rax*8+r10] + cmp r15d,247 + jae NEAR $L$_16_blocks_overflow_916 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd xmm4,xmm3,xmm27 + jmp NEAR $L$_16_blocks_ok_916 + +$L$_16_blocks_overflow_916: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb xmm4,xmm4,xmm29 +$L$_16_blocks_ok_916: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1280+rsp] + vmovdqu64 zmm1,ZMMWORD[512+rsp] + vextracti32x4 xmm2,zmm4,0 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[576+rsp] + vmovdqa64 zmm22,ZMMWORD[1344+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[640+rsp] + vmovdqa64 zmm8,ZMMWORD[1408+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[704+rsp] + vmovdqa64 zmm22,ZMMWORD[1472+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc 
zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 xmm20{k1}{z},[128+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm14,zmm24,zmm12,0x96 + vpternlogq zmm7,zmm25,zmm13,0x96 + vpternlogq zmm10,zmm26,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vpsrldq zmm15,zmm10,8 + vpslldq zmm10,zmm10,8 + + vmovdqa64 xmm16,XMMWORD[POLY2] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vpxorq zmm14,zmm14,zmm15 + vpxorq zmm7,zmm7,zmm10 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vextracti64x4 ymm12,zmm14,1 + vpxorq ymm14,ymm14,ymm12 + vextracti32x4 xmm12,ymm14,1 + vpxorq xmm14,xmm14,xmm12 + vextracti64x4 ymm13,zmm7,1 + vpxorq ymm7,ymm7,ymm13 + vextracti32x4 xmm13,ymm7,1 + vpxorq xmm7,xmm7,xmm13 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vpclmulqdq xmm13,xmm16,xmm7,0x01 + vpslldq xmm13,xmm13,8 + vpxorq xmm13,xmm7,xmm13 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vpclmulqdq xmm12,xmm16,xmm13,0x00 + vpsrldq xmm12,xmm12,4 + vpclmulqdq xmm15,xmm16,xmm13,0x10 + vpslldq xmm15,xmm15,4 + + vpternlogq xmm14,xmm15,xmm12,0x96 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast xmm4,xmm4,xmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq xmm4,xmm4,xmm20 + vextracti32x4 xmm11,zmm4,0 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 XMMWORD[128+r11*1+r10]{k1},xmm4 + vmovdqu8 zmm20{k1}{z},zmm20 + vpshufb zmm17,zmm17,zmm29 + vpshufb zmm19,zmm19,zmm29 + vpshufb xmm20,xmm20,xmm29 + vextracti32x4 xmm7,zmm20,0 + sub r13,16 * (9 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_917 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[208+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm20,xmm1,0x01 + vpclmulqdq xmm5,xmm20,xmm1,0x10 + vpclmulqdq xmm0,xmm20,xmm1,0x11 + vpclmulqdq xmm3,xmm20,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm31 + vpxorq zmm0,zmm0,zmm8 + vpxorq zmm3,zmm3,zmm22 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq 
zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_917 +$L$_small_initial_partial_block_917: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[224+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_917: + + or r13,r13 + je NEAR $L$_after_reduction_917 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_917: + jmp NEAR $L$_last_blocks_done_899 +$L$_last_num_blocks_is_10_899: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,128 + kmovq k1,[rax*8+r10] + cmp r15d,246 + jae NEAR $L$_16_blocks_overflow_918 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd ymm4,ymm3,ymm27 + jmp NEAR $L$_16_blocks_ok_918 + +$L$_16_blocks_overflow_918: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb ymm4,ymm4,ymm29 +$L$_16_blocks_ok_918: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1280+rsp] + vmovdqu64 zmm1,ZMMWORD[512+rsp] + vextracti32x4 xmm2,zmm4,1 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[576+rsp] + vmovdqa64 zmm22,ZMMWORD[1344+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[640+rsp] + vmovdqa64 zmm8,ZMMWORD[1408+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[704+rsp] + vmovdqa64 zmm22,ZMMWORD[1472+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc ymm4,ymm4,ymm30 + 
vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 ymm20{k1}{z},[128+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm14,zmm24,zmm12,0x96 + vpternlogq zmm7,zmm25,zmm13,0x96 + vpternlogq zmm10,zmm26,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vpsrldq zmm15,zmm10,8 + vpslldq zmm10,zmm10,8 + + vmovdqa64 xmm16,XMMWORD[POLY2] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vpxorq zmm14,zmm14,zmm15 + vpxorq zmm7,zmm7,zmm10 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vextracti64x4 ymm12,zmm14,1 + vpxorq ymm14,ymm14,ymm12 + vextracti32x4 xmm12,ymm14,1 + vpxorq xmm14,xmm14,xmm12 + vextracti64x4 ymm13,zmm7,1 + vpxorq ymm7,ymm7,ymm13 + vextracti32x4 xmm13,ymm7,1 + vpxorq xmm7,xmm7,xmm13 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vpclmulqdq xmm13,xmm16,xmm7,0x01 + vpslldq xmm13,xmm13,8 + vpxorq xmm13,xmm7,xmm13 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vpclmulqdq xmm12,xmm16,xmm13,0x00 + vpsrldq xmm12,xmm12,4 + vpclmulqdq xmm15,xmm16,xmm13,0x10 + vpslldq xmm15,xmm15,4 + + vpternlogq xmm14,xmm15,xmm12,0x96 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast ymm4,ymm4,ymm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq ymm4,ymm4,ymm20 + vextracti32x4 xmm11,zmm4,1 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 YMMWORD[128+r11*1+r10]{k1},ymm4 + vmovdqu8 zmm20{k1}{z},zmm20 + vpshufb zmm17,zmm17,zmm29 + vpshufb zmm19,zmm19,zmm29 + vpshufb ymm20,ymm20,ymm29 + vextracti32x4 xmm7,zmm20,1 + sub r13,16 * (10 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_919 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[192+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq 
zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm20,ymm1,0x01 + vpclmulqdq ymm5,ymm20,ymm1,0x10 + vpclmulqdq ymm0,ymm20,ymm1,0x11 + vpclmulqdq ymm3,ymm20,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm31 + vpxorq zmm0,zmm0,zmm8 + vpxorq zmm3,zmm3,zmm22 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_919 +$L$_small_initial_partial_block_919: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[208+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm20,xmm1,0x01 + vpclmulqdq xmm5,xmm20,xmm1,0x10 + vpclmulqdq xmm0,xmm20,xmm1,0x11 + vpclmulqdq xmm3,xmm20,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm31 + vpxorq zmm0,zmm0,zmm8 + vpxorq zmm3,zmm3,zmm22 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_919: + + or r13,r13 + je NEAR $L$_after_reduction_919 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_919: + jmp NEAR $L$_last_blocks_done_899 +$L$_last_num_blocks_is_11_899: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,128 + kmovq k1,[rax*8+r10] + cmp r15d,245 + jae NEAR $L$_16_blocks_overflow_920 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + jmp NEAR $L$_16_blocks_ok_920 + +$L$_16_blocks_overflow_920: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 +$L$_16_blocks_ok_920: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1280+rsp] + vmovdqu64 zmm1,ZMMWORD[512+rsp] + vextracti32x4 xmm2,zmm4,2 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[576+rsp] + vmovdqa64 zmm22,ZMMWORD[1344+rsp] + vpxorq 
zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[640+rsp] + vmovdqa64 zmm8,ZMMWORD[1408+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[704+rsp] + vmovdqa64 zmm22,ZMMWORD[1472+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20{k1}{z},[128+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm14,zmm24,zmm12,0x96 + vpternlogq zmm7,zmm25,zmm13,0x96 + vpternlogq zmm10,zmm26,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vpsrldq zmm15,zmm10,8 + vpslldq zmm10,zmm10,8 + + vmovdqa64 xmm16,XMMWORD[POLY2] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vpxorq zmm14,zmm14,zmm15 + vpxorq zmm7,zmm7,zmm10 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vextracti64x4 ymm12,zmm14,1 + vpxorq ymm14,ymm14,ymm12 + vextracti32x4 xmm12,ymm14,1 + vpxorq xmm14,xmm14,xmm12 + vextracti64x4 ymm13,zmm7,1 + vpxorq ymm7,ymm7,ymm13 + vextracti32x4 xmm13,ymm7,1 + vpxorq xmm7,xmm7,xmm13 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vpclmulqdq xmm13,xmm16,xmm7,0x01 + vpslldq xmm13,xmm13,8 + vpxorq xmm13,xmm7,xmm13 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vpclmulqdq xmm12,xmm16,xmm13,0x00 + vpsrldq xmm12,xmm12,4 + vpclmulqdq xmm15,xmm16,xmm13,0x10 + vpslldq xmm15,xmm15,4 + + vpternlogq xmm14,xmm15,xmm12,0x96 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vextracti32x4 xmm11,zmm4,2 + mov r10,QWORD[120+rbp] + vmovdqu8 
ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10]{k1},zmm4 + vmovdqu8 zmm20{k1}{z},zmm20 + vpshufb zmm17,zmm17,zmm29 + vpshufb zmm19,zmm19,zmm29 + vpshufb zmm20,zmm20,zmm29 + vextracti32x4 xmm7,zmm20,2 + sub r13,16 * (11 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_921 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[176+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm20,zmm1,0x01 + vpclmulqdq zmm5,zmm20,zmm1,0x10 + vpclmulqdq zmm0,zmm20,zmm1,0x11 + vpclmulqdq zmm3,zmm20,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm31 + vpxorq zmm0,zmm0,zmm8 + vpxorq zmm3,zmm3,zmm22 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_921 +$L$_small_initial_partial_block_921: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[192+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm20,ymm1,0x01 + vpclmulqdq ymm5,ymm20,ymm1,0x10 + vpclmulqdq ymm0,ymm20,ymm1,0x11 + vpclmulqdq ymm3,ymm20,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm31 + vpxorq zmm0,zmm0,zmm8 + vpxorq zmm3,zmm3,zmm22 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_921: + + or r13,r13 + je NEAR $L$_after_reduction_921 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_921: + jmp NEAR $L$_last_blocks_done_899 +$L$_last_num_blocks_is_12_899: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,128 + kmovq 
k1,[rax*8+r10] + cmp r15d,244 + jae NEAR $L$_16_blocks_overflow_922 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + jmp NEAR $L$_16_blocks_ok_922 + +$L$_16_blocks_overflow_922: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 +$L$_16_blocks_ok_922: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1280+rsp] + vmovdqu64 zmm1,ZMMWORD[512+rsp] + vextracti32x4 xmm2,zmm4,3 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[576+rsp] + vmovdqa64 zmm22,ZMMWORD[1344+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[640+rsp] + vmovdqa64 zmm8,ZMMWORD[1408+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[704+rsp] + vmovdqa64 zmm22,ZMMWORD[1472+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20{k1}{z},[128+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm14,zmm24,zmm12,0x96 + vpternlogq zmm7,zmm25,zmm13,0x96 + vpternlogq zmm10,zmm26,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vpsrldq zmm15,zmm10,8 + vpslldq zmm10,zmm10,8 + + vmovdqa64 xmm16,XMMWORD[POLY2] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vpxorq zmm14,zmm14,zmm15 + vpxorq zmm7,zmm7,zmm10 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vextracti64x4 ymm12,zmm14,1 + vpxorq ymm14,ymm14,ymm12 + vextracti32x4 xmm12,ymm14,1 + vpxorq xmm14,xmm14,xmm12 + vextracti64x4 ymm13,zmm7,1 + vpxorq ymm7,ymm7,ymm13 + vextracti32x4 xmm13,ymm7,1 + vpxorq xmm7,xmm7,xmm13 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vpclmulqdq xmm13,xmm16,xmm7,0x01 + vpslldq xmm13,xmm13,8 + vpxorq xmm13,xmm7,xmm13 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc 
zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vpclmulqdq xmm12,xmm16,xmm13,0x00 + vpsrldq xmm12,xmm12,4 + vpclmulqdq xmm15,xmm16,xmm13,0x10 + vpslldq xmm15,xmm15,4 + + vpternlogq xmm14,xmm15,xmm12,0x96 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vextracti32x4 xmm11,zmm4,3 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10]{k1},zmm4 + vmovdqu8 zmm20{k1}{z},zmm20 + vpshufb zmm17,zmm17,zmm29 + vpshufb zmm19,zmm19,zmm29 + vpshufb zmm20,zmm20,zmm29 + vextracti32x4 xmm7,zmm20,3 + sub r13,16 * (12 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_923 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[160+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[224+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_923 +$L$_small_initial_partial_block_923: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[176+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm20,zmm1,0x01 + vpclmulqdq zmm5,zmm20,zmm1,0x10 + vpclmulqdq zmm0,zmm20,zmm1,0x11 + vpclmulqdq zmm3,zmm20,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm31 + vpxorq zmm0,zmm0,zmm8 + vpxorq zmm3,zmm3,zmm22 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + 
vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_923: + + or r13,r13 + je NEAR $L$_after_reduction_923 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_923: + jmp NEAR $L$_last_blocks_done_899 +$L$_last_num_blocks_is_13_899: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,192 + kmovq k1,[rax*8+r10] + cmp r15d,243 + jae NEAR $L$_16_blocks_overflow_924 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + vpaddd xmm5,xmm4,xmm27 + jmp NEAR $L$_16_blocks_ok_924 + +$L$_16_blocks_overflow_924: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpaddd zmm5,zmm4,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb xmm5,xmm5,xmm29 +$L$_16_blocks_ok_924: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1280+rsp] + vmovdqu64 zmm1,ZMMWORD[512+rsp] + vextracti32x4 xmm2,zmm5,0 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[576+rsp] + vmovdqa64 zmm22,ZMMWORD[1344+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vpxorq xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[640+rsp] + vmovdqa64 zmm8,ZMMWORD[1408+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[704+rsp] + vmovdqa64 zmm22,ZMMWORD[1472+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20,ZMMWORD[128+r11*1+r9] + vmovdqu8 xmm21{k1}{z},[192+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + 
vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm14,zmm24,zmm12,0x96 + vpternlogq zmm7,zmm25,zmm13,0x96 + vpternlogq zmm10,zmm26,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vpsrldq zmm15,zmm10,8 + vpslldq zmm10,zmm10,8 + + vmovdqa64 xmm16,XMMWORD[POLY2] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vpxorq zmm14,zmm14,zmm15 + vpxorq zmm7,zmm7,zmm10 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vextracti64x4 ymm12,zmm14,1 + vpxorq ymm14,ymm14,ymm12 + vextracti32x4 xmm12,ymm14,1 + vpxorq xmm14,xmm14,xmm12 + vextracti64x4 ymm13,zmm7,1 + vpxorq ymm7,ymm7,ymm13 + vextracti32x4 xmm13,ymm7,1 + vpxorq xmm7,xmm7,xmm13 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vpclmulqdq xmm13,xmm16,xmm7,0x01 + vpslldq xmm13,xmm13,8 + vpxorq xmm13,xmm7,xmm13 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vpclmulqdq xmm12,xmm16,xmm13,0x00 + vpsrldq xmm12,xmm12,4 + vpclmulqdq xmm15,xmm16,xmm13,0x10 + vpslldq xmm15,xmm15,4 + + vpternlogq xmm14,xmm15,xmm12,0x96 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vaesenclast xmm5,xmm5,xmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vpxorq xmm5,xmm5,xmm21 + vextracti32x4 xmm11,zmm5,0 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm4 + vmovdqu8 XMMWORD[192+r11*1+r10]{k1},xmm5 + vmovdqu8 zmm21{k1}{z},zmm21 + vpshufb zmm17,zmm17,zmm29 + vpshufb zmm19,zmm19,zmm29 + vpshufb zmm20,zmm20,zmm29 + vpshufb xmm21,xmm21,xmm29 + vextracti32x4 xmm7,zmm21,0 + sub r13,16 * (13 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_925 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[144+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[208+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm21,xmm1,0x01 + vpclmulqdq xmm5,xmm21,xmm1,0x10 + vpclmulqdq xmm0,xmm21,xmm1,0x11 + vpclmulqdq xmm3,xmm21,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm31 + vpxorq zmm0,zmm0,zmm8 + vpxorq zmm3,zmm3,zmm22 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + 
vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_925 +$L$_small_initial_partial_block_925: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[160+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[224+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_925: + + or r13,r13 + je NEAR $L$_after_reduction_925 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_925: + jmp NEAR $L$_last_blocks_done_899 +$L$_last_num_blocks_is_14_899: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,192 + kmovq k1,[rax*8+r10] + cmp r15d,242 + jae NEAR $L$_16_blocks_overflow_926 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + vpaddd ymm5,ymm4,ymm27 + jmp NEAR $L$_16_blocks_ok_926 + +$L$_16_blocks_overflow_926: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpaddd zmm5,zmm4,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb ymm5,ymm5,ymm29 +$L$_16_blocks_ok_926: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1280+rsp] + vmovdqu64 zmm1,ZMMWORD[512+rsp] + vextracti32x4 xmm2,zmm5,1 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[576+rsp] + vmovdqa64 zmm22,ZMMWORD[1344+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vpxorq ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[640+rsp] + vmovdqa64 zmm8,ZMMWORD[1408+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq 
zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[704+rsp] + vmovdqa64 zmm22,ZMMWORD[1472+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20,ZMMWORD[128+r11*1+r9] + vmovdqu8 ymm21{k1}{z},[192+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm14,zmm24,zmm12,0x96 + vpternlogq zmm7,zmm25,zmm13,0x96 + vpternlogq zmm10,zmm26,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vpsrldq zmm15,zmm10,8 + vpslldq zmm10,zmm10,8 + + vmovdqa64 xmm16,XMMWORD[POLY2] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vpxorq zmm14,zmm14,zmm15 + vpxorq zmm7,zmm7,zmm10 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vextracti64x4 ymm12,zmm14,1 + vpxorq ymm14,ymm14,ymm12 + vextracti32x4 xmm12,ymm14,1 + vpxorq xmm14,xmm14,xmm12 + vextracti64x4 ymm13,zmm7,1 + vpxorq ymm7,ymm7,ymm13 + vextracti32x4 xmm13,ymm7,1 + vpxorq xmm7,xmm7,xmm13 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vpclmulqdq xmm13,xmm16,xmm7,0x01 + vpslldq xmm13,xmm13,8 + vpxorq xmm13,xmm7,xmm13 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vpclmulqdq xmm12,xmm16,xmm13,0x00 + vpsrldq xmm12,xmm12,4 + vpclmulqdq xmm15,xmm16,xmm13,0x10 + vpslldq xmm15,xmm15,4 + + vpternlogq xmm14,xmm15,xmm12,0x96 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vaesenclast ymm5,ymm5,ymm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vpxorq ymm5,ymm5,ymm21 + vextracti32x4 xmm11,zmm5,1 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 
ZMMWORD[128+r11*1+r10],zmm4 + vmovdqu8 YMMWORD[192+r11*1+r10]{k1},ymm5 + vmovdqu8 zmm21{k1}{z},zmm21 + vpshufb zmm17,zmm17,zmm29 + vpshufb zmm19,zmm19,zmm29 + vpshufb zmm20,zmm20,zmm29 + vpshufb ymm21,ymm21,ymm29 + vextracti32x4 xmm7,zmm21,1 + sub r13,16 * (14 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_927 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[128+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[192+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm21,ymm1,0x01 + vpclmulqdq ymm5,ymm21,ymm1,0x10 + vpclmulqdq ymm0,ymm21,ymm1,0x11 + vpclmulqdq ymm3,ymm21,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm31 + vpxorq zmm0,zmm0,zmm8 + vpxorq zmm3,zmm3,zmm22 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_927 +$L$_small_initial_partial_block_927: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[144+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[208+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm21,xmm1,0x01 + vpclmulqdq xmm5,xmm21,xmm1,0x10 + vpclmulqdq xmm0,xmm21,xmm1,0x11 + vpclmulqdq xmm3,xmm21,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm31 + vpxorq zmm0,zmm0,zmm8 + vpxorq zmm3,zmm3,zmm22 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + 
vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_927: + + or r13,r13 + je NEAR $L$_after_reduction_927 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_927: + jmp NEAR $L$_last_blocks_done_899 +$L$_last_num_blocks_is_15_899: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,192 + kmovq k1,[rax*8+r10] + cmp r15d,241 + jae NEAR $L$_16_blocks_overflow_928 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + vpaddd zmm5,zmm4,zmm27 + jmp NEAR $L$_16_blocks_ok_928 + +$L$_16_blocks_overflow_928: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpaddd zmm5,zmm4,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb zmm5,zmm5,zmm29 +$L$_16_blocks_ok_928: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1280+rsp] + vmovdqu64 zmm1,ZMMWORD[512+rsp] + vextracti32x4 xmm2,zmm5,2 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[576+rsp] + vmovdqa64 zmm22,ZMMWORD[1344+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[640+rsp] + vmovdqa64 zmm8,ZMMWORD[1408+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[704+rsp] + vmovdqa64 zmm22,ZMMWORD[1472+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20,ZMMWORD[128+r11*1+r9] + vmovdqu8 zmm21{k1}{z},[192+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm14,zmm24,zmm12,0x96 + vpternlogq zmm7,zmm25,zmm13,0x96 + vpternlogq zmm10,zmm26,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vpsrldq zmm15,zmm10,8 + vpslldq zmm10,zmm10,8 + + 
vmovdqa64 xmm16,XMMWORD[POLY2] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vpxorq zmm14,zmm14,zmm15 + vpxorq zmm7,zmm7,zmm10 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vextracti64x4 ymm12,zmm14,1 + vpxorq ymm14,ymm14,ymm12 + vextracti32x4 xmm12,ymm14,1 + vpxorq xmm14,xmm14,xmm12 + vextracti64x4 ymm13,zmm7,1 + vpxorq ymm7,ymm7,ymm13 + vextracti32x4 xmm13,ymm7,1 + vpxorq xmm7,xmm7,xmm13 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vpclmulqdq xmm13,xmm16,xmm7,0x01 + vpslldq xmm13,xmm13,8 + vpxorq xmm13,xmm7,xmm13 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vpclmulqdq xmm12,xmm16,xmm13,0x00 + vpsrldq xmm12,xmm12,4 + vpclmulqdq xmm15,xmm16,xmm13,0x10 + vpslldq xmm15,xmm15,4 + + vpternlogq xmm14,xmm15,xmm12,0x96 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vaesenclast zmm5,zmm5,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vpxorq zmm5,zmm5,zmm21 + vextracti32x4 xmm11,zmm5,2 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm4 + vmovdqu8 ZMMWORD[192+r11*1+r10]{k1},zmm5 + vmovdqu8 zmm21{k1}{z},zmm21 + vpshufb zmm17,zmm17,zmm29 + vpshufb zmm19,zmm19,zmm29 + vpshufb zmm20,zmm20,zmm29 + vpshufb zmm21,zmm21,zmm29 + vextracti32x4 xmm7,zmm21,2 + sub r13,16 * (15 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_929 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[112+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[176+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm21,zmm1,0x01 + vpclmulqdq zmm5,zmm21,zmm1,0x10 + vpclmulqdq zmm0,zmm21,zmm1,0x11 + vpclmulqdq zmm3,zmm21,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm31 + vpxorq zmm0,zmm0,zmm8 + vpxorq zmm3,zmm3,zmm22 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq 
xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_929 +$L$_small_initial_partial_block_929: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[128+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[192+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm21,ymm1,0x01 + vpclmulqdq ymm5,ymm21,ymm1,0x10 + vpclmulqdq ymm0,ymm21,ymm1,0x11 + vpclmulqdq ymm3,ymm21,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm31 + vpxorq zmm0,zmm0,zmm8 + vpxorq zmm3,zmm3,zmm22 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_929: + + or r13,r13 + je NEAR $L$_after_reduction_929 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_929: + jmp NEAR $L$_last_blocks_done_899 +$L$_last_num_blocks_is_16_899: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,192 + kmovq k1,[rax*8+r10] + cmp r15d,240 + jae NEAR $L$_16_blocks_overflow_930 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + vpaddd zmm5,zmm4,zmm27 + jmp NEAR $L$_16_blocks_ok_930 + +$L$_16_blocks_overflow_930: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpaddd zmm5,zmm4,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb zmm5,zmm5,zmm29 +$L$_16_blocks_ok_930: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vmovdqa64 zmm8,ZMMWORD[1280+rsp] + vmovdqu64 zmm1,ZMMWORD[512+rsp] + vextracti32x4 xmm2,zmm5,3 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[576+rsp] + vmovdqa64 zmm22,ZMMWORD[1344+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[640+rsp] + vmovdqa64 zmm8,ZMMWORD[1408+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + 
vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[704+rsp] + vmovdqa64 zmm22,ZMMWORD[1472+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20,ZMMWORD[128+r11*1+r9] + vmovdqu8 zmm21{k1}{z},[192+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpternlogq zmm14,zmm24,zmm12,0x96 + vpternlogq zmm7,zmm25,zmm13,0x96 + vpternlogq zmm10,zmm26,zmm15,0x96 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vpsrldq zmm15,zmm10,8 + vpslldq zmm10,zmm10,8 + + vmovdqa64 xmm16,XMMWORD[POLY2] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vpxorq zmm14,zmm14,zmm15 + vpxorq zmm7,zmm7,zmm10 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vextracti64x4 ymm12,zmm14,1 + vpxorq ymm14,ymm14,ymm12 + vextracti32x4 xmm12,ymm14,1 + vpxorq xmm14,xmm14,xmm12 + vextracti64x4 ymm13,zmm7,1 + vpxorq ymm7,ymm7,ymm13 + vextracti32x4 xmm13,ymm7,1 + vpxorq xmm7,xmm7,xmm13 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vpclmulqdq xmm13,xmm16,xmm7,0x01 + vpslldq xmm13,xmm13,8 + vpxorq xmm13,xmm7,xmm13 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vpclmulqdq xmm12,xmm16,xmm13,0x00 + vpsrldq xmm12,xmm12,4 + vpclmulqdq xmm15,xmm16,xmm13,0x10 + vpslldq xmm15,xmm15,4 + + vpternlogq xmm14,xmm15,xmm12,0x96 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vaesenclast zmm5,zmm5,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vpxorq zmm5,zmm5,zmm21 + vextracti32x4 xmm11,zmm5,3 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm4 + 
vmovdqu8 ZMMWORD[192+r11*1+r10]{k1},zmm5 + vmovdqu8 zmm21{k1}{z},zmm21 + vpshufb zmm17,zmm17,zmm29 + vpshufb zmm19,zmm19,zmm29 + vpshufb zmm20,zmm20,zmm29 + vpshufb zmm21,zmm21,zmm29 + vextracti32x4 xmm7,zmm21,3 + sub r13,16 * (16 - 1) +$L$_small_initial_partial_block_931: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vpxorq zmm17,zmm17,zmm14 + vmovdqu64 zmm1,ZMMWORD[112+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[176+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm21,zmm1,0x01 + vpclmulqdq zmm5,zmm21,zmm1,0x10 + vpclmulqdq zmm0,zmm21,zmm1,0x11 + vpclmulqdq zmm3,zmm21,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm31 + vpxorq zmm0,zmm0,zmm8 + vpxorq zmm3,zmm3,zmm22 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_931: + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_931: + jmp NEAR $L$_last_blocks_done_899 +$L$_last_num_blocks_is_0_899: + vmovdqa64 zmm13,ZMMWORD[1280+rsp] + vmovdqu64 zmm12,ZMMWORD[512+rsp] + vpclmulqdq zmm0,zmm13,zmm12,0x11 + vpclmulqdq zmm3,zmm13,zmm12,0x00 + vpclmulqdq zmm4,zmm13,zmm12,0x01 + vpclmulqdq zmm5,zmm13,zmm12,0x10 + vmovdqa64 zmm13,ZMMWORD[1344+rsp] + vmovdqu64 zmm12,ZMMWORD[576+rsp] + vpclmulqdq zmm6,zmm13,zmm12,0x11 + vpclmulqdq zmm7,zmm13,zmm12,0x00 + vpclmulqdq zmm10,zmm13,zmm12,0x01 + vpclmulqdq zmm11,zmm13,zmm12,0x10 + vpternlogq zmm26,zmm4,zmm10,0x96 + vpternlogq zmm24,zmm0,zmm6,0x96 + vpternlogq zmm25,zmm3,zmm7,0x96 + vpternlogq zmm26,zmm5,zmm11,0x96 + vmovdqa64 zmm13,ZMMWORD[1408+rsp] + vmovdqu64 zmm12,ZMMWORD[640+rsp] + vpclmulqdq zmm0,zmm13,zmm12,0x11 + vpclmulqdq zmm3,zmm13,zmm12,0x00 + vpclmulqdq zmm4,zmm13,zmm12,0x01 + vpclmulqdq zmm5,zmm13,zmm12,0x10 + vmovdqa64 zmm13,ZMMWORD[1472+rsp] + vmovdqu64 zmm12,ZMMWORD[704+rsp] + vpclmulqdq zmm6,zmm13,zmm12,0x11 + vpclmulqdq zmm7,zmm13,zmm12,0x00 + vpclmulqdq zmm10,zmm13,zmm12,0x01 + vpclmulqdq zmm11,zmm13,zmm12,0x10 + + vpternlogq zmm26,zmm4,zmm10,0x96 + vpternlogq zmm24,zmm0,zmm6,0x96 + vpternlogq zmm25,zmm3,zmm7,0x96 + vpternlogq zmm26,zmm5,zmm11,0x96 + + vpsrldq zmm0,zmm26,8 + vpslldq zmm3,zmm26,8 + vpxorq zmm24,zmm24,zmm0 + vpxorq zmm25,zmm25,zmm3 + vextracti64x4 ymm0,zmm24,1 + vpxorq ymm24,ymm24,ymm0 + vextracti32x4 xmm0,ymm24,1 + vpxorq xmm24,xmm24,xmm0 + vextracti64x4 ymm3,zmm25,1 + vpxorq ymm25,ymm25,ymm3 + vextracti32x4 xmm3,ymm25,1 + vpxorq xmm25,xmm25,xmm3 + 
vmovdqa64 xmm4,XMMWORD[POLY2] + + + vpclmulqdq xmm0,xmm4,xmm25,0x01 + vpslldq xmm0,xmm0,8 + vpxorq xmm0,xmm25,xmm0 + + + vpclmulqdq xmm3,xmm4,xmm0,0x00 + vpsrldq xmm3,xmm3,4 + vpclmulqdq xmm14,xmm4,xmm0,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm3,xmm24,0x96 + +$L$_last_blocks_done_899: + vpshufb xmm2,xmm2,xmm29 + jmp NEAR $L$_ghash_done_821 + +$L$_message_below_32_blocks_821: + + + sub r13,256 + add r11,256 + mov r10d,r13d + test r14,r14 + jnz NEAR $L$_skip_hkeys_precomputation_932 + vmovdqu64 zmm3,ZMMWORD[640+rsp] + + + vshufi64x2 zmm3,zmm3,zmm3,0x00 + + vmovdqu64 zmm4,ZMMWORD[576+rsp] + vmovdqu64 zmm5,ZMMWORD[512+rsp] + + vpclmulqdq zmm6,zmm4,zmm3,0x11 + vpclmulqdq zmm7,zmm4,zmm3,0x00 + vpclmulqdq zmm10,zmm4,zmm3,0x01 + vpclmulqdq zmm4,zmm4,zmm3,0x10 + vpxorq zmm4,zmm4,zmm10 + + vpsrldq zmm10,zmm4,8 + vpslldq zmm4,zmm4,8 + vpxorq zmm6,zmm6,zmm10 + vpxorq zmm4,zmm4,zmm7 + + + + vmovdqu64 zmm10,ZMMWORD[POLY2] + + vpclmulqdq zmm7,zmm10,zmm4,0x01 + vpslldq zmm7,zmm7,8 + vpxorq zmm4,zmm4,zmm7 + + + + vpclmulqdq zmm7,zmm10,zmm4,0x00 + vpsrldq zmm7,zmm7,4 + vpclmulqdq zmm4,zmm10,zmm4,0x10 + vpslldq zmm4,zmm4,4 + + vpternlogq zmm4,zmm6,zmm7,0x96 + + vmovdqu64 ZMMWORD[448+rsp],zmm4 + + vpclmulqdq zmm6,zmm5,zmm3,0x11 + vpclmulqdq zmm7,zmm5,zmm3,0x00 + vpclmulqdq zmm10,zmm5,zmm3,0x01 + vpclmulqdq zmm5,zmm5,zmm3,0x10 + vpxorq zmm5,zmm5,zmm10 + + vpsrldq zmm10,zmm5,8 + vpslldq zmm5,zmm5,8 + vpxorq zmm6,zmm6,zmm10 + vpxorq zmm5,zmm5,zmm7 + + + + vmovdqu64 zmm10,ZMMWORD[POLY2] + + vpclmulqdq zmm7,zmm10,zmm5,0x01 + vpslldq zmm7,zmm7,8 + vpxorq zmm5,zmm5,zmm7 + + + + vpclmulqdq zmm7,zmm10,zmm5,0x00 + vpsrldq zmm7,zmm7,4 + vpclmulqdq zmm5,zmm10,zmm5,0x10 + vpslldq zmm5,zmm5,4 + + vpternlogq zmm5,zmm6,zmm7,0x96 + + vmovdqu64 ZMMWORD[384+rsp],zmm5 + + vpclmulqdq zmm6,zmm4,zmm3,0x11 + vpclmulqdq zmm7,zmm4,zmm3,0x00 + vpclmulqdq zmm10,zmm4,zmm3,0x01 + vpclmulqdq zmm4,zmm4,zmm3,0x10 + vpxorq zmm4,zmm4,zmm10 + + vpsrldq zmm10,zmm4,8 + vpslldq zmm4,zmm4,8 + vpxorq zmm6,zmm6,zmm10 + vpxorq zmm4,zmm4,zmm7 + + + + vmovdqu64 zmm10,ZMMWORD[POLY2] + + vpclmulqdq zmm7,zmm10,zmm4,0x01 + vpslldq zmm7,zmm7,8 + vpxorq zmm4,zmm4,zmm7 + + + + vpclmulqdq zmm7,zmm10,zmm4,0x00 + vpsrldq zmm7,zmm7,4 + vpclmulqdq zmm4,zmm10,zmm4,0x10 + vpslldq zmm4,zmm4,4 + + vpternlogq zmm4,zmm6,zmm7,0x96 + + vmovdqu64 ZMMWORD[320+rsp],zmm4 + + vpclmulqdq zmm6,zmm5,zmm3,0x11 + vpclmulqdq zmm7,zmm5,zmm3,0x00 + vpclmulqdq zmm10,zmm5,zmm3,0x01 + vpclmulqdq zmm5,zmm5,zmm3,0x10 + vpxorq zmm5,zmm5,zmm10 + + vpsrldq zmm10,zmm5,8 + vpslldq zmm5,zmm5,8 + vpxorq zmm6,zmm6,zmm10 + vpxorq zmm5,zmm5,zmm7 + + + + vmovdqu64 zmm10,ZMMWORD[POLY2] + + vpclmulqdq zmm7,zmm10,zmm5,0x01 + vpslldq zmm7,zmm7,8 + vpxorq zmm5,zmm5,zmm7 + + + + vpclmulqdq zmm7,zmm10,zmm5,0x00 + vpsrldq zmm7,zmm7,4 + vpclmulqdq zmm5,zmm10,zmm5,0x10 + vpslldq zmm5,zmm5,4 + + vpternlogq zmm5,zmm6,zmm7,0x96 + + vmovdqu64 ZMMWORD[256+rsp],zmm5 +$L$_skip_hkeys_precomputation_932: + mov r14,1 + and r10d,~15 + mov ebx,512 + sub ebx,r10d + mov r10d,r13d + add r10d,15 + shr r10d,4 + je NEAR $L$_last_num_blocks_is_0_933 + + cmp r10d,8 + je NEAR $L$_last_num_blocks_is_8_933 + jb NEAR $L$_last_num_blocks_is_7_1_933 + + + cmp r10d,12 + je NEAR $L$_last_num_blocks_is_12_933 + jb NEAR $L$_last_num_blocks_is_11_9_933 + + + cmp r10d,15 + je NEAR $L$_last_num_blocks_is_15_933 + ja NEAR $L$_last_num_blocks_is_16_933 + cmp r10d,14 + je NEAR $L$_last_num_blocks_is_14_933 + jmp NEAR $L$_last_num_blocks_is_13_933 + +$L$_last_num_blocks_is_11_9_933: + + cmp r10d,10 + je NEAR $L$_last_num_blocks_is_10_933 
+ ja NEAR $L$_last_num_blocks_is_11_933 + jmp NEAR $L$_last_num_blocks_is_9_933 + +$L$_last_num_blocks_is_7_1_933: + cmp r10d,4 + je NEAR $L$_last_num_blocks_is_4_933 + jb NEAR $L$_last_num_blocks_is_3_1_933 + + cmp r10d,6 + ja NEAR $L$_last_num_blocks_is_7_933 + je NEAR $L$_last_num_blocks_is_6_933 + jmp NEAR $L$_last_num_blocks_is_5_933 + +$L$_last_num_blocks_is_3_1_933: + + cmp r10d,2 + ja NEAR $L$_last_num_blocks_is_3_933 + je NEAR $L$_last_num_blocks_is_2_933 +$L$_last_num_blocks_is_1_933: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + kmovq k1,[rax*8+r10] + cmp r15d,255 + jae NEAR $L$_16_blocks_overflow_934 + vpaddd xmm0,xmm2,xmm28 + jmp NEAR $L$_16_blocks_ok_934 + +$L$_16_blocks_overflow_934: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpshufb xmm0,xmm0,xmm29 +$L$_16_blocks_ok_934: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm0,0 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc xmm0,xmm0,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc xmm0,xmm0,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 xmm17{k1}{z},[r11*1+r9] + vaesenc xmm0,xmm0,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc xmm0,xmm0,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc xmm0,xmm0,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc xmm0,xmm0,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + vaesenc xmm0,xmm0,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + vaesenc xmm0,xmm0,xmm31 + vaesenclast xmm0,xmm0,xmm30 + vpxorq xmm0,xmm0,xmm17 + vextracti32x4 xmm11,zmm0,0 + mov r10,QWORD[120+rbp] + vmovdqu8 XMMWORD[r11*1+r10]{k1},xmm0 + vmovdqu8 zmm17{k1}{z},zmm17 + vpshufb xmm17,xmm17,xmm29 + vextracti32x4 xmm7,zmm17,0 + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_935 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm17,xmm1,0x01 + vpclmulqdq xmm5,xmm17,xmm1,0x10 + vpclmulqdq xmm0,xmm17,xmm1,0x11 + vpclmulqdq xmm3,xmm17,xmm1,0x00 + 
vpxorq zmm4,zmm4,zmm26 + vpxorq zmm0,zmm0,zmm24 + vpxorq zmm3,zmm3,zmm25 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_935 +$L$_small_initial_partial_block_935: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + + + vpsrldq zmm0,zmm26,8 + vpslldq zmm3,zmm26,8 + vpxorq zmm24,zmm24,zmm0 + vpxorq zmm25,zmm25,zmm3 + vextracti64x4 ymm0,zmm24,1 + vpxorq ymm24,ymm24,ymm0 + vextracti32x4 xmm0,ymm24,1 + vpxorq xmm24,xmm24,xmm0 + vextracti64x4 ymm3,zmm25,1 + vpxorq ymm25,ymm25,ymm3 + vextracti32x4 xmm3,ymm25,1 + vpxorq xmm25,xmm25,xmm3 + vmovdqa64 xmm0,XMMWORD[POLY2] + + + vpclmulqdq xmm3,xmm0,xmm25,0x01 + vpslldq xmm3,xmm3,8 + vpxorq xmm3,xmm25,xmm3 + + + vpclmulqdq xmm4,xmm0,xmm3,0x00 + vpsrldq xmm4,xmm4,4 + vpclmulqdq xmm14,xmm0,xmm3,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm4,xmm24,0x96 + + + + + + + + + + + + + vpxorq xmm14,xmm14,xmm7 + + jmp NEAR $L$_after_reduction_935 +$L$_small_initial_compute_done_935: +$L$_after_reduction_935: + jmp NEAR $L$_last_blocks_done_933 +$L$_last_num_blocks_is_2_933: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + kmovq k1,[rax*8+r10] + cmp r15d,254 + jae NEAR $L$_16_blocks_overflow_936 + vpaddd ymm0,ymm2,ymm28 + jmp NEAR $L$_16_blocks_ok_936 + +$L$_16_blocks_overflow_936: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpshufb ymm0,ymm0,ymm29 +$L$_16_blocks_ok_936: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm0,1 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc ymm0,ymm0,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc ymm0,ymm0,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 ymm17{k1}{z},[r11*1+r9] + vaesenc ymm0,ymm0,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq 
zmm13,zmm22,zmm18,0x00 + vaesenc ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc ymm0,ymm0,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc ymm0,ymm0,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc ymm0,ymm0,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + vaesenc ymm0,ymm0,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + vaesenc ymm0,ymm0,ymm31 + vaesenclast ymm0,ymm0,ymm30 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm11,zmm0,1 + mov r10,QWORD[120+rbp] + vmovdqu8 YMMWORD[r11*1+r10]{k1},ymm0 + vmovdqu8 zmm17{k1}{z},zmm17 + vpshufb ymm17,ymm17,ymm29 + vextracti32x4 xmm7,zmm17,1 + sub r13,16 * (2 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_937 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm17,ymm1,0x01 + vpclmulqdq ymm5,ymm17,ymm1,0x10 + vpclmulqdq ymm0,ymm17,ymm1,0x11 + vpclmulqdq ymm3,ymm17,ymm1,0x00 + vpxorq zmm4,zmm4,zmm26 + vpxorq zmm0,zmm0,zmm24 + vpxorq zmm3,zmm3,zmm25 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_937 +$L$_small_initial_partial_block_937: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm17,xmm1,0x01 + vpclmulqdq xmm5,xmm17,xmm1,0x10 + vpclmulqdq xmm0,xmm17,xmm1,0x11 + vpclmulqdq xmm3,xmm17,xmm1,0x00 + vpxorq zmm4,zmm4,zmm26 + vpxorq zmm0,zmm0,zmm24 + vpxorq zmm3,zmm3,zmm25 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_937: + + or r13,r13 + je NEAR $L$_after_reduction_937 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_937: + jmp NEAR $L$_last_blocks_done_933 +$L$_last_num_blocks_is_3_933: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + kmovq k1,[rax*8+r10] + cmp r15d,253 + jae NEAR $L$_16_blocks_overflow_938 + vpaddd zmm0,zmm2,zmm28 + jmp NEAR $L$_16_blocks_ok_938 + +$L$_16_blocks_overflow_938: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpshufb zmm0,zmm0,zmm29 +$L$_16_blocks_ok_938: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm0,2 + vshufi64x2 
zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17{k1}{z},[r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vpxorq zmm0,zmm0,zmm17 + vextracti32x4 xmm11,zmm0,2 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10]{k1},zmm0 + vmovdqu8 zmm17{k1}{z},zmm17 + vpshufb zmm17,zmm17,zmm29 + vextracti32x4 xmm7,zmm17,2 + sub r13,16 * (3 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_939 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpxorq zmm4,zmm4,zmm26 + vpxorq zmm0,zmm0,zmm24 + vpxorq zmm3,zmm3,zmm25 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_939 +$L$_small_initial_partial_block_939: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm17,ymm1,0x01 + vpclmulqdq 
ymm5,ymm17,ymm1,0x10 + vpclmulqdq ymm0,ymm17,ymm1,0x11 + vpclmulqdq ymm3,ymm17,ymm1,0x00 + vpxorq zmm4,zmm4,zmm26 + vpxorq zmm0,zmm0,zmm24 + vpxorq zmm3,zmm3,zmm25 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_939: + + or r13,r13 + je NEAR $L$_after_reduction_939 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_939: + jmp NEAR $L$_last_blocks_done_933 +$L$_last_num_blocks_is_4_933: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + kmovq k1,[rax*8+r10] + cmp r15d,252 + jae NEAR $L$_16_blocks_overflow_940 + vpaddd zmm0,zmm2,zmm28 + jmp NEAR $L$_16_blocks_ok_940 + +$L$_16_blocks_overflow_940: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpshufb zmm0,zmm0,zmm29 +$L$_16_blocks_ok_940: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm0,3 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17{k1}{z},[r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vpxorq 
zmm0,zmm0,zmm17 + vextracti32x4 xmm11,zmm0,3 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10]{k1},zmm0 + vmovdqu8 zmm17{k1}{z},zmm17 + vpshufb zmm17,zmm17,zmm29 + vextracti32x4 xmm7,zmm17,3 + sub r13,16 * (4 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_941 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + + vpxorq zmm30,zmm30,zmm26 + vpxorq zmm8,zmm8,zmm24 + vpxorq zmm22,zmm22,zmm25 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_941 +$L$_small_initial_partial_block_941: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpxorq zmm4,zmm4,zmm26 + vpxorq zmm0,zmm0,zmm24 + vpxorq zmm3,zmm3,zmm25 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_941: + + or r13,r13 + je NEAR $L$_after_reduction_941 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_941: + jmp NEAR $L$_last_blocks_done_933 +$L$_last_num_blocks_is_5_933: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,64 + kmovq k1,[rax*8+r10] + cmp r15d,251 + jae NEAR $L$_16_blocks_overflow_942 + vpaddd zmm0,zmm2,zmm28 + vpaddd xmm3,xmm0,xmm27 + jmp NEAR $L$_16_blocks_ok_942 + +$L$_16_blocks_overflow_942: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb xmm3,xmm3,xmm29 +$L$_16_blocks_ok_942: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm3,0 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + 
vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 xmm19{k1}{z},[64+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc xmm3,xmm3,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc xmm3,xmm3,xmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast xmm3,xmm3,xmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq xmm3,xmm3,xmm19 + vextracti32x4 xmm11,zmm3,0 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 XMMWORD[64+r11*1+r10]{k1},xmm3 + vmovdqu8 zmm19{k1}{z},zmm19 + vpshufb zmm17,zmm17,zmm29 + vpshufb xmm19,xmm19,xmm29 + vextracti32x4 xmm7,zmm19,0 + sub r13,16 * (5 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_943 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm19,xmm1,0x01 + vpclmulqdq xmm5,xmm19,xmm1,0x10 + vpclmulqdq xmm0,xmm19,xmm1,0x11 + vpclmulqdq xmm3,xmm19,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR 
$L$_small_initial_compute_done_943 +$L$_small_initial_partial_block_943: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + + vpxorq zmm30,zmm30,zmm26 + vpxorq zmm8,zmm8,zmm24 + vpxorq zmm22,zmm22,zmm25 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_943: + + or r13,r13 + je NEAR $L$_after_reduction_943 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_943: + jmp NEAR $L$_last_blocks_done_933 +$L$_last_num_blocks_is_6_933: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,64 + kmovq k1,[rax*8+r10] + cmp r15d,250 + jae NEAR $L$_16_blocks_overflow_944 + vpaddd zmm0,zmm2,zmm28 + vpaddd ymm3,ymm0,ymm27 + jmp NEAR $L$_16_blocks_ok_944 + +$L$_16_blocks_overflow_944: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb ymm3,ymm3,ymm29 +$L$_16_blocks_ok_944: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm3,1 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 ymm19{k1}{z},[64+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + 
vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc ymm3,ymm3,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc ymm3,ymm3,ymm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast ymm3,ymm3,ymm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm11,zmm3,1 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 YMMWORD[64+r11*1+r10]{k1},ymm3 + vmovdqu8 zmm19{k1}{z},zmm19 + vpshufb zmm17,zmm17,zmm29 + vpshufb ymm19,ymm19,ymm29 + vextracti32x4 xmm7,zmm19,1 + sub r13,16 * (6 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_945 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm19,ymm1,0x01 + vpclmulqdq ymm5,ymm19,ymm1,0x10 + vpclmulqdq ymm0,ymm19,ymm1,0x11 + vpclmulqdq ymm3,ymm19,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_945 +$L$_small_initial_partial_block_945: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm19,xmm1,0x01 + vpclmulqdq xmm5,xmm19,xmm1,0x10 + vpclmulqdq xmm0,xmm19,xmm1,0x11 + vpclmulqdq xmm3,xmm19,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + 
+$L$_small_initial_compute_done_945: + + or r13,r13 + je NEAR $L$_after_reduction_945 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_945: + jmp NEAR $L$_last_blocks_done_933 +$L$_last_num_blocks_is_7_933: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,64 + kmovq k1,[rax*8+r10] + cmp r15d,249 + jae NEAR $L$_16_blocks_overflow_946 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + jmp NEAR $L$_16_blocks_ok_946 + +$L$_16_blocks_overflow_946: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 +$L$_16_blocks_ok_946: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm3,2 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19{k1}{z},[64+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti32x4 xmm11,zmm3,2 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10]{k1},zmm3 + vmovdqu8 zmm19{k1}{z},zmm19 + vpshufb 
zmm17,zmm17,zmm29 + vpshufb zmm19,zmm19,zmm29 + vextracti32x4 xmm7,zmm19,2 + sub r13,16 * (7 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_947 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm19,zmm1,0x01 + vpclmulqdq zmm5,zmm19,zmm1,0x10 + vpclmulqdq zmm0,zmm19,zmm1,0x11 + vpclmulqdq zmm3,zmm19,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_947 +$L$_small_initial_partial_block_947: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm19,ymm1,0x01 + vpclmulqdq ymm5,ymm19,ymm1,0x10 + vpclmulqdq ymm0,ymm19,ymm1,0x11 + vpclmulqdq ymm3,ymm19,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_947: + + or r13,r13 + je NEAR $L$_after_reduction_947 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_947: + jmp NEAR $L$_last_blocks_done_933 +$L$_last_num_blocks_is_8_933: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,64 + kmovq k1,[rax*8+r10] + cmp r15d,248 + jae NEAR $L$_16_blocks_overflow_948 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + jmp NEAR $L$_16_blocks_ok_948 + +$L$_16_blocks_overflow_948: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 +$L$_16_blocks_ok_948: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm3,3 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq 
zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19{k1}{z},[64+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti32x4 xmm11,zmm3,3 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10]{k1},zmm3 + vmovdqu8 zmm19{k1}{z},zmm19 + vpshufb zmm17,zmm17,zmm29 + vpshufb zmm19,zmm19,zmm29 + vextracti32x4 xmm7,zmm19,3 + sub r13,16 * (8 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_949 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[224+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + + vpxorq zmm30,zmm30,zmm26 + vpxorq zmm8,zmm8,zmm24 + vpxorq zmm22,zmm22,zmm25 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + 
vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_949 +$L$_small_initial_partial_block_949: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm8,zmm17,zmm1,0x11 + vpclmulqdq zmm22,zmm17,zmm1,0x00 + vpclmulqdq zmm30,zmm17,zmm1,0x01 + vpclmulqdq zmm31,zmm17,zmm1,0x10 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm19,zmm1,0x01 + vpclmulqdq zmm5,zmm19,zmm1,0x10 + vpclmulqdq zmm0,zmm19,zmm1,0x11 + vpclmulqdq zmm3,zmm19,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_949: + + or r13,r13 + je NEAR $L$_after_reduction_949 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_949: + jmp NEAR $L$_last_blocks_done_933 +$L$_last_num_blocks_is_9_933: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,128 + kmovq k1,[rax*8+r10] + cmp r15d,247 + jae NEAR $L$_16_blocks_overflow_950 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd xmm4,xmm3,xmm27 + jmp NEAR $L$_16_blocks_ok_950 + +$L$_16_blocks_overflow_950: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb xmm4,xmm4,xmm29 +$L$_16_blocks_ok_950: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm4,0 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + 
vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 xmm20{k1}{z},[128+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc xmm4,xmm4,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc xmm4,xmm4,xmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast xmm4,xmm4,xmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq xmm4,xmm4,xmm20 + vextracti32x4 xmm11,zmm4,0 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 XMMWORD[128+r11*1+r10]{k1},xmm4 + vmovdqu8 zmm20{k1}{z},zmm20 + vpshufb zmm17,zmm17,zmm29 + vpshufb zmm19,zmm19,zmm29 + vpshufb xmm20,xmm20,xmm29 + vextracti32x4 xmm7,zmm20,0 + sub r13,16 * (9 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_951 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[208+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm20,xmm1,0x01 + vpclmulqdq xmm5,xmm20,xmm1,0x10 + vpclmulqdq xmm0,xmm20,xmm1,0x11 + vpclmulqdq xmm3,xmm20,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + 
vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_951 +$L$_small_initial_partial_block_951: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[224+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + + vpxorq zmm30,zmm30,zmm26 + vpxorq zmm8,zmm8,zmm24 + vpxorq zmm22,zmm22,zmm25 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_951: + + or r13,r13 + je NEAR $L$_after_reduction_951 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_951: + jmp NEAR $L$_last_blocks_done_933 +$L$_last_num_blocks_is_10_933: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,128 + kmovq k1,[rax*8+r10] + cmp r15d,246 + jae NEAR $L$_16_blocks_overflow_952 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd ymm4,ymm3,ymm27 + jmp NEAR $L$_16_blocks_ok_952 + +$L$_16_blocks_overflow_952: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb ymm4,ymm4,ymm29 +$L$_16_blocks_ok_952: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm4,1 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc 
ymm4,ymm4,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 ymm20{k1}{z},[128+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc ymm4,ymm4,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc ymm4,ymm4,ymm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast ymm4,ymm4,ymm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq ymm4,ymm4,ymm20 + vextracti32x4 xmm11,zmm4,1 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 YMMWORD[128+r11*1+r10]{k1},ymm4 + vmovdqu8 zmm20{k1}{z},zmm20 + vpshufb zmm17,zmm17,zmm29 + vpshufb zmm19,zmm19,zmm29 + vpshufb ymm20,ymm20,ymm29 + vextracti32x4 xmm7,zmm20,1 + sub r13,16 * (10 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_953 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[192+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm20,ymm1,0x01 + vpclmulqdq ymm5,ymm20,ymm1,0x10 + vpclmulqdq ymm0,ymm20,ymm1,0x11 + vpclmulqdq ymm3,ymm20,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq 
xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_953 +$L$_small_initial_partial_block_953: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[208+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm20,xmm1,0x01 + vpclmulqdq xmm5,xmm20,xmm1,0x10 + vpclmulqdq xmm0,xmm20,xmm1,0x11 + vpclmulqdq xmm3,xmm20,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_953: + + or r13,r13 + je NEAR $L$_after_reduction_953 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_953: + jmp NEAR $L$_last_blocks_done_933 +$L$_last_num_blocks_is_11_933: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,128 + kmovq k1,[rax*8+r10] + cmp r15d,245 + jae NEAR $L$_16_blocks_overflow_954 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + jmp NEAR $L$_16_blocks_ok_954 + +$L$_16_blocks_overflow_954: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 +$L$_16_blocks_ok_954: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm4,2 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq 
zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20{k1}{z},[128+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vextracti32x4 xmm11,zmm4,2 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10]{k1},zmm4 + vmovdqu8 zmm20{k1}{z},zmm20 + vpshufb zmm17,zmm17,zmm29 + vpshufb zmm19,zmm19,zmm29 + vpshufb zmm20,zmm20,zmm29 + vextracti32x4 xmm7,zmm20,2 + sub r13,16 * (11 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_955 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[176+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm20,zmm1,0x01 + vpclmulqdq zmm5,zmm20,zmm1,0x10 + vpclmulqdq zmm0,zmm20,zmm1,0x11 + vpclmulqdq zmm3,zmm20,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + 
vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_955 +$L$_small_initial_partial_block_955: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[192+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm20,ymm1,0x01 + vpclmulqdq ymm5,ymm20,ymm1,0x10 + vpclmulqdq ymm0,ymm20,ymm1,0x11 + vpclmulqdq ymm3,ymm20,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_955: + + or r13,r13 + je NEAR $L$_after_reduction_955 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_955: + jmp NEAR $L$_last_blocks_done_933 +$L$_last_num_blocks_is_12_933: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,128 + kmovq k1,[rax*8+r10] + cmp r15d,244 + jae NEAR $L$_16_blocks_overflow_956 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + jmp NEAR $L$_16_blocks_ok_956 + +$L$_16_blocks_overflow_956: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 +$L$_16_blocks_ok_956: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm4,3 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 
+ vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20{k1}{z},[128+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vextracti32x4 xmm11,zmm4,3 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10]{k1},zmm4 + vmovdqu8 zmm20{k1}{z},zmm20 + vpshufb zmm17,zmm17,zmm29 + vpshufb zmm19,zmm19,zmm29 + vpshufb zmm20,zmm20,zmm29 + vextracti32x4 xmm7,zmm20,3 + sub r13,16 * (12 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_957 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[160+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[224+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + + vpxorq zmm30,zmm30,zmm26 + vpxorq zmm8,zmm8,zmm24 + vpxorq zmm22,zmm22,zmm25 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq 
xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_957 +$L$_small_initial_partial_block_957: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[176+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vpxorq zmm8,zmm0,zmm8 + vpxorq zmm22,zmm3,zmm22 + vpxorq zmm30,zmm4,zmm30 + vpxorq zmm31,zmm5,zmm31 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm20,zmm1,0x01 + vpclmulqdq zmm5,zmm20,zmm1,0x10 + vpclmulqdq zmm0,zmm20,zmm1,0x11 + vpclmulqdq zmm3,zmm20,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_957: + + or r13,r13 + je NEAR $L$_after_reduction_957 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_957: + jmp NEAR $L$_last_blocks_done_933 +$L$_last_num_blocks_is_13_933: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,192 + kmovq k1,[rax*8+r10] + cmp r15d,243 + jae NEAR $L$_16_blocks_overflow_958 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + vpaddd xmm5,xmm4,xmm27 + jmp NEAR $L$_16_blocks_ok_958 + +$L$_16_blocks_overflow_958: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpaddd zmm5,zmm4,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb xmm5,xmm5,xmm29 +$L$_16_blocks_ok_958: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm5,0 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vpxorq xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + 
vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20,ZMMWORD[128+r11*1+r9] + vmovdqu8 xmm21{k1}{z},[192+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc xmm5,xmm5,xmm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc xmm5,xmm5,xmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vaesenclast xmm5,xmm5,xmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vpxorq xmm5,xmm5,xmm21 + vextracti32x4 xmm11,zmm5,0 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm4 + vmovdqu8 XMMWORD[192+r11*1+r10]{k1},xmm5 + vmovdqu8 zmm21{k1}{z},zmm21 + vpshufb zmm17,zmm17,zmm29 + vpshufb zmm19,zmm19,zmm29 + vpshufb zmm20,zmm20,zmm29 + vpshufb xmm21,xmm21,xmm29 + vextracti32x4 xmm7,zmm21,0 + sub r13,16 * (13 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_959 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[144+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[208+rdx] + 
vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm21,xmm1,0x01 + vpclmulqdq xmm5,xmm21,xmm1,0x10 + vpclmulqdq xmm0,xmm21,xmm1,0x11 + vpclmulqdq xmm3,xmm21,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_959 +$L$_small_initial_partial_block_959: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[160+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[224+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[288+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + + vpxorq zmm30,zmm30,zmm26 + vpxorq zmm8,zmm8,zmm24 + vpxorq zmm22,zmm22,zmm25 + + vpxorq zmm30,zmm30,zmm31 + vpsrldq zmm4,zmm30,8 + vpslldq zmm5,zmm30,8 + vpxorq zmm0,zmm8,zmm4 + vpxorq zmm3,zmm22,zmm5 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_959: + + or r13,r13 + je NEAR $L$_after_reduction_959 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_959: + jmp NEAR $L$_last_blocks_done_933 +$L$_last_num_blocks_is_14_933: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,192 + kmovq k1,[rax*8+r10] + cmp r15d,242 + jae NEAR $L$_16_blocks_overflow_960 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + vpaddd ymm5,ymm4,ymm27 + jmp NEAR $L$_16_blocks_ok_960 + +$L$_16_blocks_overflow_960: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpaddd zmm5,zmm4,zmm5 + vpshufb 
zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb ymm5,ymm5,ymm29 +$L$_16_blocks_ok_960: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm5,1 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vpxorq ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20,ZMMWORD[128+r11*1+r9] + vmovdqu8 ymm21{k1}{z},[192+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc ymm5,ymm5,ymm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc ymm5,ymm5,ymm31 + vaesenclast 
zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vaesenclast ymm5,ymm5,ymm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vpxorq ymm5,ymm5,ymm21 + vextracti32x4 xmm11,zmm5,1 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm4 + vmovdqu8 YMMWORD[192+r11*1+r10]{k1},ymm5 + vmovdqu8 zmm21{k1}{z},zmm21 + vpshufb zmm17,zmm17,zmm29 + vpshufb zmm19,zmm19,zmm29 + vpshufb zmm20,zmm20,zmm29 + vpshufb ymm21,ymm21,ymm29 + vextracti32x4 xmm7,zmm21,1 + sub r13,16 * (14 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_961 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[128+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[192+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm21,ymm1,0x01 + vpclmulqdq ymm5,ymm21,ymm1,0x10 + vpclmulqdq ymm0,ymm21,ymm1,0x11 + vpclmulqdq ymm3,ymm21,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_961 +$L$_small_initial_partial_block_961: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[144+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[208+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[272+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 xmm1,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm21,xmm1,0x01 + vpclmulqdq xmm5,xmm21,xmm1,0x10 + vpclmulqdq xmm0,xmm21,xmm1,0x11 + vpclmulqdq xmm3,xmm21,xmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq 
ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_961: + + or r13,r13 + je NEAR $L$_after_reduction_961 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_961: + jmp NEAR $L$_last_blocks_done_933 +$L$_last_num_blocks_is_15_933: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,192 + kmovq k1,[rax*8+r10] + cmp r15d,241 + jae NEAR $L$_16_blocks_overflow_962 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + vpaddd zmm5,zmm4,zmm27 + jmp NEAR $L$_16_blocks_ok_962 + +$L$_16_blocks_overflow_962: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpaddd zmm5,zmm4,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb zmm5,zmm5,zmm29 +$L$_16_blocks_ok_962: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm5,2 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20,ZMMWORD[128+r11*1+r9] + vmovdqu8 zmm21{k1}{z},[192+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + 
vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vaesenclast zmm5,zmm5,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vpxorq zmm5,zmm5,zmm21 + vextracti32x4 xmm11,zmm5,2 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm4 + vmovdqu8 ZMMWORD[192+r11*1+r10]{k1},zmm5 + vmovdqu8 zmm21{k1}{z},zmm21 + vpshufb zmm17,zmm17,zmm29 + vpshufb zmm19,zmm19,zmm29 + vpshufb zmm20,zmm20,zmm29 + vpshufb zmm21,zmm21,zmm29 + vextracti32x4 xmm7,zmm21,2 + sub r13,16 * (15 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_963 + + + + + + sub r13,16 + mov QWORD[r8],0 + vmovdqu64 zmm1,ZMMWORD[112+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[176+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm21,zmm1,0x01 + vpclmulqdq zmm5,zmm21,zmm1,0x10 + vpclmulqdq zmm0,zmm21,zmm1,0x11 + vpclmulqdq zmm3,zmm21,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_963 +$L$_small_initial_partial_block_963: + + + 
+ + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[128+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[192+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[256+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 ymm1,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm21,ymm1,0x01 + vpclmulqdq ymm5,ymm21,ymm1,0x10 + vpclmulqdq ymm0,ymm21,ymm1,0x11 + vpclmulqdq ymm3,ymm21,ymm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_963: + + or r13,r13 + je NEAR $L$_after_reduction_963 + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_963: + jmp NEAR $L$_last_blocks_done_933 +$L$_last_num_blocks_is_16_933: + lea r10,[byte64_len_to_mask_table] + mov rax,r13 + sub rax,192 + kmovq k1,[rax*8+r10] + cmp r15d,240 + jae NEAR $L$_16_blocks_overflow_964 + vpaddd zmm0,zmm2,zmm28 + vpaddd zmm3,zmm0,zmm27 + vpaddd zmm4,zmm3,zmm27 + vpaddd zmm5,zmm4,zmm27 + jmp NEAR $L$_16_blocks_ok_964 + +$L$_16_blocks_overflow_964: + vpshufb zmm2,zmm2,zmm29 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vmovdqa64 zmm5,ZMMWORD[ddq_add_4444] + vpaddd zmm3,zmm0,zmm5 + vpaddd zmm4,zmm3,zmm5 + vpaddd zmm5,zmm4,zmm5 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb zmm5,zmm5,zmm29 +$L$_16_blocks_ok_964: + + + + + vbroadcastf64x2 zmm30,ZMMWORD[rcx] + vpxorq zmm8,zmm14,ZMMWORD[768+rsp] + vmovdqu64 zmm1,ZMMWORD[rbx*1+rsp] + vextracti32x4 xmm2,zmm5,3 + vshufi64x2 zmm2,zmm2,zmm2,0 + + + vbroadcastf64x2 zmm31,ZMMWORD[16+rcx] + vmovdqu64 zmm18,ZMMWORD[64+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[832+rsp] + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm30 + vpxorq zmm4,zmm4,zmm30 + vpxorq zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[32+rcx] + + + vpclmulqdq zmm14,zmm8,zmm1,0x11 + vpclmulqdq zmm7,zmm8,zmm1,0x00 + vpclmulqdq zmm10,zmm8,zmm1,0x01 + vpclmulqdq zmm11,zmm8,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[128+rbx*1+rsp] + vmovdqa64 zmm8,ZMMWORD[896+rsp] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[48+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vmovdqu64 zmm18,ZMMWORD[192+rbx*1+rsp] + vmovdqa64 zmm22,ZMMWORD[960+rsp] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc 
zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[64+rcx] + + + vpclmulqdq zmm20,zmm8,zmm1,0x10 + vpclmulqdq zmm21,zmm8,zmm1,0x01 + vpclmulqdq zmm17,zmm8,zmm1,0x11 + vpclmulqdq zmm19,zmm8,zmm1,0x00 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[80+rcx] + + + vpternlogq zmm14,zmm12,zmm17,0x96 + vpternlogq zmm7,zmm13,zmm19,0x96 + vpternlogq zmm11,zmm16,zmm21,0x96 + vpternlogq zmm10,zmm15,zmm20,0x96 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[96+rcx] + vmovdqu8 zmm17,ZMMWORD[r11*1+r9] + vmovdqu8 zmm19,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm20,ZMMWORD[128+r11*1+r9] + vmovdqu8 zmm21{k1}{z},[192+r11*1+r9] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[112+rcx] + + + vpclmulqdq zmm15,zmm22,zmm18,0x10 + vpclmulqdq zmm16,zmm22,zmm18,0x01 + vpclmulqdq zmm12,zmm22,zmm18,0x11 + vpclmulqdq zmm13,zmm22,zmm18,0x00 + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[128+rcx] + vpternlogq zmm10,zmm11,zmm16,0x96 + vpxorq zmm24,zmm14,zmm12 + vpxorq zmm25,zmm7,zmm13 + vpxorq zmm26,zmm10,zmm15 + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vbroadcastf64x2 zmm31,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm30 + vaesenc zmm3,zmm3,zmm30 + vaesenc zmm4,zmm4,zmm30 + vaesenc zmm5,zmm5,zmm30 + vbroadcastf64x2 zmm30,ZMMWORD[224+rcx] + vaesenc zmm0,zmm0,zmm31 + vaesenc zmm3,zmm3,zmm31 + vaesenc zmm4,zmm4,zmm31 + vaesenc zmm5,zmm5,zmm31 + vaesenclast zmm0,zmm0,zmm30 + vaesenclast zmm3,zmm3,zmm30 + vaesenclast zmm4,zmm4,zmm30 + vaesenclast zmm5,zmm5,zmm30 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vpxorq zmm4,zmm4,zmm20 + vpxorq zmm5,zmm5,zmm21 + vextracti32x4 xmm11,zmm5,3 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm4 + vmovdqu8 ZMMWORD[192+r11*1+r10]{k1},zmm5 + vmovdqu8 zmm21{k1}{z},zmm21 + vpshufb zmm17,zmm17,zmm29 + vpshufb zmm19,zmm19,zmm29 + vpshufb zmm20,zmm20,zmm29 + vpshufb zmm21,zmm21,zmm29 + vextracti32x4 xmm7,zmm21,3 + sub r13,16 * (16 - 1) +$L$_small_initial_partial_block_965: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm11 + vmovdqu64 zmm1,ZMMWORD[112+rdx] + vpclmulqdq zmm0,zmm17,zmm1,0x11 + vpclmulqdq zmm3,zmm17,zmm1,0x00 + vpclmulqdq zmm4,zmm17,zmm1,0x01 + vpclmulqdq zmm5,zmm17,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[176+rdx] + vpclmulqdq zmm8,zmm19,zmm1,0x11 + vpclmulqdq zmm22,zmm19,zmm1,0x00 + vpclmulqdq zmm30,zmm19,zmm1,0x01 + vpclmulqdq zmm31,zmm19,zmm1,0x10 + vmovdqu64 zmm1,ZMMWORD[240+rdx] + vpclmulqdq zmm17,zmm20,zmm1,0x11 + vpclmulqdq zmm19,zmm20,zmm1,0x00 + vpternlogq zmm8,zmm17,zmm0,0x96 + vpternlogq 
zmm22,zmm19,zmm3,0x96 + vpclmulqdq zmm17,zmm20,zmm1,0x01 + vpclmulqdq zmm19,zmm20,zmm1,0x10 + vpternlogq zmm30,zmm17,zmm4,0x96 + vpternlogq zmm31,zmm19,zmm5,0x96 + vmovdqu64 ymm1,YMMWORD[304+rdx] + vinserti64x2 zmm1,zmm1,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm21,zmm1,0x01 + vpclmulqdq zmm5,zmm21,zmm1,0x10 + vpclmulqdq zmm0,zmm21,zmm1,0x11 + vpclmulqdq zmm3,zmm21,zmm1,0x00 + + vpxorq zmm4,zmm4,zmm30 + vpternlogq zmm5,zmm26,zmm31,0x96 + vpternlogq zmm0,zmm24,zmm8,0x96 + vpternlogq zmm3,zmm25,zmm22,0x96 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm30,zmm4,8 + vpslldq zmm31,zmm4,8 + vpxorq zmm0,zmm0,zmm30 + vpxorq zmm3,zmm3,zmm31 + vextracti64x4 ymm30,zmm0,1 + vpxorq ymm0,ymm0,ymm30 + vextracti32x4 xmm30,ymm0,1 + vpxorq xmm0,xmm0,xmm30 + vextracti64x4 ymm31,zmm3,1 + vpxorq ymm3,ymm3,ymm31 + vextracti32x4 xmm31,ymm3,1 + vpxorq xmm3,xmm3,xmm31 + vmovdqa64 xmm1,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm1,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm1,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm1,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_965: + vpxorq xmm14,xmm14,xmm7 +$L$_after_reduction_965: + jmp NEAR $L$_last_blocks_done_933 +$L$_last_num_blocks_is_0_933: + vmovdqa64 zmm13,ZMMWORD[768+rsp] + vpxorq zmm13,zmm13,zmm14 + vmovdqu64 zmm12,ZMMWORD[rbx*1+rsp] + vpclmulqdq zmm0,zmm13,zmm12,0x11 + vpclmulqdq zmm3,zmm13,zmm12,0x00 + vpclmulqdq zmm4,zmm13,zmm12,0x01 + vpclmulqdq zmm5,zmm13,zmm12,0x10 + vmovdqa64 zmm13,ZMMWORD[832+rsp] + vmovdqu64 zmm12,ZMMWORD[64+rbx*1+rsp] + vpclmulqdq zmm6,zmm13,zmm12,0x11 + vpclmulqdq zmm7,zmm13,zmm12,0x00 + vpclmulqdq zmm10,zmm13,zmm12,0x01 + vpclmulqdq zmm11,zmm13,zmm12,0x10 + vpxorq zmm26,zmm4,zmm10 + vpxorq zmm24,zmm0,zmm6 + vpxorq zmm25,zmm3,zmm7 + vpternlogq zmm26,zmm5,zmm11,0x96 + vmovdqa64 zmm13,ZMMWORD[896+rsp] + vmovdqu64 zmm12,ZMMWORD[128+rbx*1+rsp] + vpclmulqdq zmm0,zmm13,zmm12,0x11 + vpclmulqdq zmm3,zmm13,zmm12,0x00 + vpclmulqdq zmm4,zmm13,zmm12,0x01 + vpclmulqdq zmm5,zmm13,zmm12,0x10 + vmovdqa64 zmm13,ZMMWORD[960+rsp] + vmovdqu64 zmm12,ZMMWORD[192+rbx*1+rsp] + vpclmulqdq zmm6,zmm13,zmm12,0x11 + vpclmulqdq zmm7,zmm13,zmm12,0x00 + vpclmulqdq zmm10,zmm13,zmm12,0x01 + vpclmulqdq zmm11,zmm13,zmm12,0x10 + + vpternlogq zmm26,zmm4,zmm10,0x96 + vpternlogq zmm24,zmm0,zmm6,0x96 + vpternlogq zmm25,zmm3,zmm7,0x96 + vpternlogq zmm26,zmm5,zmm11,0x96 + + vpsrldq zmm0,zmm26,8 + vpslldq zmm3,zmm26,8 + vpxorq zmm24,zmm24,zmm0 + vpxorq zmm25,zmm25,zmm3 + vextracti64x4 ymm0,zmm24,1 + vpxorq ymm24,ymm24,ymm0 + vextracti32x4 xmm0,ymm24,1 + vpxorq xmm24,xmm24,xmm0 + vextracti64x4 ymm3,zmm25,1 + vpxorq ymm25,ymm25,ymm3 + vextracti32x4 xmm3,ymm25,1 + vpxorq xmm25,xmm25,xmm3 + vmovdqa64 xmm4,XMMWORD[POLY2] + + + vpclmulqdq xmm0,xmm4,xmm25,0x01 + vpslldq xmm0,xmm0,8 + vpxorq xmm0,xmm25,xmm0 + + + vpclmulqdq xmm3,xmm4,xmm0,0x00 + vpsrldq xmm3,xmm3,4 + vpclmulqdq xmm14,xmm4,xmm0,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm3,xmm24,0x96 + +$L$_last_blocks_done_933: + vpshufb xmm2,xmm2,xmm29 + jmp NEAR $L$_ghash_done_821 + +$L$_message_below_equal_16_blocks_821: + + + mov r12d,r13d + add r12d,15 + shr r12d,4 + cmp r12,8 + je NEAR $L$_small_initial_num_blocks_is_8_966 + jl NEAR $L$_small_initial_num_blocks_is_7_1_966 + + + cmp r12,12 + je NEAR $L$_small_initial_num_blocks_is_12_966 + jl NEAR $L$_small_initial_num_blocks_is_11_9_966 + + + cmp r12,16 + je NEAR $L$_small_initial_num_blocks_is_16_966 + cmp r12,15 + je NEAR $L$_small_initial_num_blocks_is_15_966 + cmp r12,14 + je NEAR 
$L$_small_initial_num_blocks_is_14_966 + jmp NEAR $L$_small_initial_num_blocks_is_13_966 + +$L$_small_initial_num_blocks_is_11_9_966: + + cmp r12,11 + je NEAR $L$_small_initial_num_blocks_is_11_966 + cmp r12,10 + je NEAR $L$_small_initial_num_blocks_is_10_966 + jmp NEAR $L$_small_initial_num_blocks_is_9_966 + +$L$_small_initial_num_blocks_is_7_1_966: + cmp r12,4 + je NEAR $L$_small_initial_num_blocks_is_4_966 + jl NEAR $L$_small_initial_num_blocks_is_3_1_966 + + cmp r12,7 + je NEAR $L$_small_initial_num_blocks_is_7_966 + cmp r12,6 + je NEAR $L$_small_initial_num_blocks_is_6_966 + jmp NEAR $L$_small_initial_num_blocks_is_5_966 + +$L$_small_initial_num_blocks_is_3_1_966: + + cmp r12,3 + je NEAR $L$_small_initial_num_blocks_is_3_966 + cmp r12,2 + je NEAR $L$_small_initial_num_blocks_is_2_966 + + + + + +$L$_small_initial_num_blocks_is_1_966: + vmovdqa64 xmm29,XMMWORD[SHUF_MASK] + vpaddd xmm0,xmm2,XMMWORD[ONE] + lea r10,[byte64_len_to_mask_table] + mov r15,r13 + kmovq k1,[r15*8+r10] + vextracti32x4 xmm2,zmm0,0 + vpshufb xmm0,xmm0,xmm29 + vmovdqu8 xmm6{k1}{z},[r11*1+r9] + vbroadcastf64x2 zmm15,ZMMWORD[rcx] + vpxorq xmm0,xmm0,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[16+rcx] + vaesenc xmm0,xmm0,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[32+rcx] + vaesenc xmm0,xmm0,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[48+rcx] + vaesenc xmm0,xmm0,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[64+rcx] + vaesenc xmm0,xmm0,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[80+rcx] + vaesenc xmm0,xmm0,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[96+rcx] + vaesenc xmm0,xmm0,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[112+rcx] + vaesenc xmm0,xmm0,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[128+rcx] + vaesenc xmm0,xmm0,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[144+rcx] + vaesenc xmm0,xmm0,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[160+rcx] + vaesenc xmm0,xmm0,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[176+rcx] + vaesenc xmm0,xmm0,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[192+rcx] + vaesenc xmm0,xmm0,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[208+rcx] + vaesenc xmm0,xmm0,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[224+rcx] + vaesenclast xmm0,xmm0,xmm15 + vpxorq xmm0,xmm0,xmm6 + vextracti32x4 xmm12,zmm0,0 + mov r10,QWORD[120+rbp] + vmovdqu8 XMMWORD[r11*1+r10]{k1},xmm0 + vmovdqu8 zmm0{k1}{z},zmm0 + vpshufb xmm6,xmm6,xmm29 + vextracti32x4 xmm13,zmm6,0 + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_967 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 xmm20,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm6,xmm20,0x01 + vpclmulqdq xmm5,xmm6,xmm20,0x10 + vpclmulqdq xmm0,xmm6,xmm20,0x11 + vpclmulqdq xmm3,xmm6,xmm20,0x00 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_967 +$L$_small_initial_partial_block_967: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm12 + + + + + + + + + + + + vpxorq xmm14,xmm14,xmm13 + + jmp NEAR $L$_after_reduction_967 +$L$_small_initial_compute_done_967: +$L$_after_reduction_967: + jmp NEAR $L$_small_initial_blocks_encrypted_966 
+$L$_small_initial_num_blocks_is_2_966: + vmovdqa64 ymm29,YMMWORD[SHUF_MASK] + vshufi64x2 ymm0,ymm2,ymm2,0 + vpaddd ymm0,ymm0,YMMWORD[ddq_add_1234] + lea r10,[byte64_len_to_mask_table] + mov r15,r13 + kmovq k1,[r15*8+r10] + vextracti32x4 xmm2,zmm0,1 + vpshufb ymm0,ymm0,ymm29 + vmovdqu8 ymm6{k1}{z},[r11*1+r9] + vbroadcastf64x2 zmm15,ZMMWORD[rcx] + vpxorq ymm0,ymm0,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[16+rcx] + vaesenc ymm0,ymm0,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[32+rcx] + vaesenc ymm0,ymm0,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[48+rcx] + vaesenc ymm0,ymm0,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[64+rcx] + vaesenc ymm0,ymm0,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[80+rcx] + vaesenc ymm0,ymm0,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[96+rcx] + vaesenc ymm0,ymm0,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[112+rcx] + vaesenc ymm0,ymm0,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[128+rcx] + vaesenc ymm0,ymm0,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[144+rcx] + vaesenc ymm0,ymm0,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[160+rcx] + vaesenc ymm0,ymm0,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[176+rcx] + vaesenc ymm0,ymm0,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[192+rcx] + vaesenc ymm0,ymm0,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[208+rcx] + vaesenc ymm0,ymm0,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[224+rcx] + vaesenclast ymm0,ymm0,ymm15 + vpxorq ymm0,ymm0,ymm6 + vextracti32x4 xmm12,zmm0,1 + mov r10,QWORD[120+rbp] + vmovdqu8 YMMWORD[r11*1+r10]{k1},ymm0 + vmovdqu8 zmm0{k1}{z},zmm0 + vpshufb ymm6,ymm6,ymm29 + vextracti32x4 xmm13,zmm6,1 + sub r13,16 * (2 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_968 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 ymm20,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm6,ymm20,0x01 + vpclmulqdq ymm5,ymm6,ymm20,0x10 + vpclmulqdq ymm0,ymm6,ymm20,0x11 + vpclmulqdq ymm3,ymm6,ymm20,0x00 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_968 +$L$_small_initial_partial_block_968: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm12 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 xmm20,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm6,xmm20,0x01 + vpclmulqdq xmm5,xmm6,xmm20,0x10 + vpclmulqdq xmm0,xmm6,xmm20,0x11 + vpclmulqdq xmm3,xmm6,xmm20,0x00 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_968: + + or r13,r13 + je NEAR $L$_after_reduction_968 + vpxorq xmm14,xmm14,xmm13 +$L$_after_reduction_968: + jmp NEAR 
$L$_small_initial_blocks_encrypted_966 +$L$_small_initial_num_blocks_is_3_966: + vmovdqa64 zmm29,ZMMWORD[SHUF_MASK] + vshufi64x2 zmm2,zmm2,zmm2,0 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + lea r10,[byte64_len_to_mask_table] + mov r15,r13 + kmovq k1,[r15*8+r10] + vextracti32x4 xmm2,zmm0,2 + vpshufb zmm0,zmm0,zmm29 + vmovdqu8 zmm6{k1}{z},[r11*1+r9] + vbroadcastf64x2 zmm15,ZMMWORD[rcx] + vpxorq zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[16+rcx] + vaesenc zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[32+rcx] + vaesenc zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[48+rcx] + vaesenc zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[64+rcx] + vaesenc zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[80+rcx] + vaesenc zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[96+rcx] + vaesenc zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[112+rcx] + vaesenc zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[128+rcx] + vaesenc zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[224+rcx] + vaesenclast zmm0,zmm0,zmm15 + vpxorq zmm0,zmm0,zmm6 + vextracti32x4 xmm12,zmm0,2 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10]{k1},zmm0 + vmovdqu8 zmm0{k1}{z},zmm0 + vpshufb zmm6,zmm6,zmm29 + vextracti32x4 xmm13,zmm6,2 + sub r13,16 * (3 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_969 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 ymm20,YMMWORD[304+rdx] + vinserti64x2 zmm20,zmm20,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm6,zmm20,0x01 + vpclmulqdq zmm5,zmm6,zmm20,0x10 + vpclmulqdq zmm0,zmm6,zmm20,0x11 + vpclmulqdq zmm3,zmm6,zmm20,0x00 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_969 +$L$_small_initial_partial_block_969: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm12 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 ymm20,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm6,ymm20,0x01 + vpclmulqdq ymm5,ymm6,ymm20,0x10 + vpclmulqdq ymm0,ymm6,ymm20,0x11 + vpclmulqdq ymm3,ymm6,ymm20,0x00 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_969: + + or r13,r13 + je NEAR 
$L$_after_reduction_969 + vpxorq xmm14,xmm14,xmm13 +$L$_after_reduction_969: + jmp NEAR $L$_small_initial_blocks_encrypted_966 +$L$_small_initial_num_blocks_is_4_966: + vmovdqa64 zmm29,ZMMWORD[SHUF_MASK] + vshufi64x2 zmm2,zmm2,zmm2,0 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + lea r10,[byte64_len_to_mask_table] + mov r15,r13 + kmovq k1,[r15*8+r10] + vextracti32x4 xmm2,zmm0,3 + vpshufb zmm0,zmm0,zmm29 + vmovdqu8 zmm6{k1}{z},[r11*1+r9] + vbroadcastf64x2 zmm15,ZMMWORD[rcx] + vpxorq zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[16+rcx] + vaesenc zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[32+rcx] + vaesenc zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[48+rcx] + vaesenc zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[64+rcx] + vaesenc zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[80+rcx] + vaesenc zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[96+rcx] + vaesenc zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[112+rcx] + vaesenc zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[128+rcx] + vaesenc zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[224+rcx] + vaesenclast zmm0,zmm0,zmm15 + vpxorq zmm0,zmm0,zmm6 + vextracti32x4 xmm12,zmm0,3 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10]{k1},zmm0 + vmovdqu8 zmm0{k1}{z},zmm0 + vpshufb zmm6,zmm6,zmm29 + vextracti32x4 xmm13,zmm6,3 + sub r13,16 * (4 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_970 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[288+rdx] + vpclmulqdq zmm15,zmm6,zmm20,0x11 + vpclmulqdq zmm16,zmm6,zmm20,0x00 + vpclmulqdq zmm17,zmm6,zmm20,0x01 + vpclmulqdq zmm19,zmm6,zmm20,0x10 + + vpxorq zmm17,zmm17,zmm19 + vpsrldq zmm4,zmm17,8 + vpslldq zmm5,zmm17,8 + vpxorq zmm0,zmm15,zmm4 + vpxorq zmm3,zmm16,zmm5 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_970 +$L$_small_initial_partial_block_970: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm12 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 ymm20,YMMWORD[304+rdx] + vinserti64x2 zmm20,zmm20,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm6,zmm20,0x01 + vpclmulqdq zmm5,zmm6,zmm20,0x10 + vpclmulqdq zmm0,zmm6,zmm20,0x11 + vpclmulqdq zmm3,zmm6,zmm20,0x00 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + 
vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_970: + + or r13,r13 + je NEAR $L$_after_reduction_970 + vpxorq xmm14,xmm14,xmm13 +$L$_after_reduction_970: + jmp NEAR $L$_small_initial_blocks_encrypted_966 +$L$_small_initial_num_blocks_is_5_966: + vmovdqa64 zmm29,ZMMWORD[SHUF_MASK] + vshufi64x2 zmm2,zmm2,zmm2,0 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpaddd zmm3,zmm2,ZMMWORD[ddq_add_5678] + lea r10,[byte64_len_to_mask_table] + mov r15,r13 + sub r15,64 + kmovq k1,[r15*8+r10] + vextracti32x4 xmm2,zmm3,0 + vpshufb zmm0,zmm0,zmm29 + vpshufb xmm3,xmm3,xmm29 + vmovdqu8 zmm6,ZMMWORD[r11*1+r9] + vmovdqu8 xmm7{k1}{z},[64+r11*1+r9] + vbroadcastf64x2 zmm15,ZMMWORD[rcx] + vpxorq zmm0,zmm0,zmm15 + vpxorq xmm3,xmm3,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[16+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc xmm3,xmm3,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[32+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc xmm3,xmm3,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[48+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc xmm3,xmm3,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[64+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc xmm3,xmm3,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[80+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc xmm3,xmm3,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[96+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc xmm3,xmm3,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[112+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc xmm3,xmm3,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[128+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc xmm3,xmm3,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc xmm3,xmm3,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc xmm3,xmm3,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc xmm3,xmm3,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc xmm3,xmm3,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc xmm3,xmm3,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[224+rcx] + vaesenclast zmm0,zmm0,zmm15 + vaesenclast xmm3,xmm3,xmm15 + vpxorq zmm0,zmm0,zmm6 + vpxorq xmm3,xmm3,xmm7 + vextracti32x4 xmm12,zmm3,0 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 XMMWORD[64+r11*1+r10]{k1},xmm3 + vmovdqu8 zmm3{k1}{z},zmm3 + vpshufb zmm6,zmm6,zmm29 + vpshufb xmm7,xmm7,xmm29 + vextracti32x4 xmm13,zmm7,0 + sub r13,16 * (5 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_971 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[272+rdx] + vpclmulqdq zmm15,zmm6,zmm20,0x11 + vpclmulqdq zmm16,zmm6,zmm20,0x00 + vpclmulqdq zmm17,zmm6,zmm20,0x01 + vpclmulqdq zmm19,zmm6,zmm20,0x10 + vmovdqu64 xmm20,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm7,xmm20,0x01 + vpclmulqdq xmm5,xmm7,xmm20,0x10 + vpclmulqdq xmm0,xmm7,xmm20,0x11 + vpclmulqdq xmm3,xmm7,xmm20,0x00 + + vpxorq zmm4,zmm4,zmm17 + vpxorq zmm5,zmm5,zmm19 + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm16 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR 
$L$_small_initial_compute_done_971 +$L$_small_initial_partial_block_971: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm12 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[288+rdx] + vpclmulqdq zmm15,zmm6,zmm20,0x11 + vpclmulqdq zmm16,zmm6,zmm20,0x00 + vpclmulqdq zmm17,zmm6,zmm20,0x01 + vpclmulqdq zmm19,zmm6,zmm20,0x10 + + vpxorq zmm17,zmm17,zmm19 + vpsrldq zmm4,zmm17,8 + vpslldq zmm5,zmm17,8 + vpxorq zmm0,zmm15,zmm4 + vpxorq zmm3,zmm16,zmm5 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_971: + + or r13,r13 + je NEAR $L$_after_reduction_971 + vpxorq xmm14,xmm14,xmm13 +$L$_after_reduction_971: + jmp NEAR $L$_small_initial_blocks_encrypted_966 +$L$_small_initial_num_blocks_is_6_966: + vmovdqa64 zmm29,ZMMWORD[SHUF_MASK] + vshufi64x2 zmm2,zmm2,zmm2,0 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpaddd zmm3,zmm2,ZMMWORD[ddq_add_5678] + lea r10,[byte64_len_to_mask_table] + mov r15,r13 + sub r15,64 + kmovq k1,[r15*8+r10] + vextracti32x4 xmm2,zmm3,1 + vpshufb zmm0,zmm0,zmm29 + vpshufb ymm3,ymm3,ymm29 + vmovdqu8 zmm6,ZMMWORD[r11*1+r9] + vmovdqu8 ymm7{k1}{z},[64+r11*1+r9] + vbroadcastf64x2 zmm15,ZMMWORD[rcx] + vpxorq zmm0,zmm0,zmm15 + vpxorq ymm3,ymm3,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[16+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc ymm3,ymm3,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[32+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc ymm3,ymm3,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[48+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc ymm3,ymm3,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[64+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc ymm3,ymm3,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[80+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc ymm3,ymm3,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[96+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc ymm3,ymm3,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[112+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc ymm3,ymm3,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[128+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc ymm3,ymm3,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc ymm3,ymm3,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc ymm3,ymm3,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc ymm3,ymm3,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc ymm3,ymm3,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc ymm3,ymm3,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[224+rcx] + vaesenclast zmm0,zmm0,zmm15 + vaesenclast ymm3,ymm3,ymm15 + vpxorq zmm0,zmm0,zmm6 + vpxorq ymm3,ymm3,ymm7 + vextracti32x4 xmm12,zmm3,1 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 YMMWORD[64+r11*1+r10]{k1},ymm3 + vmovdqu8 zmm3{k1}{z},zmm3 + vpshufb zmm6,zmm6,zmm29 + vpshufb ymm7,ymm7,ymm29 + vextracti32x4 xmm13,zmm7,1 + sub r13,16 * (6 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_972 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[256+rdx] + vpclmulqdq zmm15,zmm6,zmm20,0x11 + vpclmulqdq zmm16,zmm6,zmm20,0x00 + vpclmulqdq zmm17,zmm6,zmm20,0x01 + 
vpclmulqdq zmm19,zmm6,zmm20,0x10 + vmovdqu64 ymm20,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm7,ymm20,0x01 + vpclmulqdq ymm5,ymm7,ymm20,0x10 + vpclmulqdq ymm0,ymm7,ymm20,0x11 + vpclmulqdq ymm3,ymm7,ymm20,0x00 + + vpxorq zmm4,zmm4,zmm17 + vpxorq zmm5,zmm5,zmm19 + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm16 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_972 +$L$_small_initial_partial_block_972: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm12 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[272+rdx] + vpclmulqdq zmm15,zmm6,zmm20,0x11 + vpclmulqdq zmm16,zmm6,zmm20,0x00 + vpclmulqdq zmm17,zmm6,zmm20,0x01 + vpclmulqdq zmm19,zmm6,zmm20,0x10 + vmovdqu64 xmm20,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm7,xmm20,0x01 + vpclmulqdq xmm5,xmm7,xmm20,0x10 + vpclmulqdq xmm0,xmm7,xmm20,0x11 + vpclmulqdq xmm3,xmm7,xmm20,0x00 + + vpxorq zmm4,zmm4,zmm17 + vpxorq zmm5,zmm5,zmm19 + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm16 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_972: + + or r13,r13 + je NEAR $L$_after_reduction_972 + vpxorq xmm14,xmm14,xmm13 +$L$_after_reduction_972: + jmp NEAR $L$_small_initial_blocks_encrypted_966 +$L$_small_initial_num_blocks_is_7_966: + vmovdqa64 zmm29,ZMMWORD[SHUF_MASK] + vshufi64x2 zmm2,zmm2,zmm2,0 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpaddd zmm3,zmm2,ZMMWORD[ddq_add_5678] + lea r10,[byte64_len_to_mask_table] + mov r15,r13 + sub r15,64 + kmovq k1,[r15*8+r10] + vextracti32x4 xmm2,zmm3,2 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vmovdqu8 zmm6,ZMMWORD[r11*1+r9] + vmovdqu8 zmm7{k1}{z},[64+r11*1+r9] + vbroadcastf64x2 zmm15,ZMMWORD[rcx] + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[16+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[32+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[48+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[64+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[80+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[96+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[112+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[128+rcx] + vaesenc 
zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[224+rcx] + vaesenclast zmm0,zmm0,zmm15 + vaesenclast zmm3,zmm3,zmm15 + vpxorq zmm0,zmm0,zmm6 + vpxorq zmm3,zmm3,zmm7 + vextracti32x4 xmm12,zmm3,2 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10]{k1},zmm3 + vmovdqu8 zmm3{k1}{z},zmm3 + vpshufb zmm6,zmm6,zmm29 + vpshufb zmm7,zmm7,zmm29 + vextracti32x4 xmm13,zmm7,2 + sub r13,16 * (7 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_973 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[240+rdx] + vpclmulqdq zmm15,zmm6,zmm20,0x11 + vpclmulqdq zmm16,zmm6,zmm20,0x00 + vpclmulqdq zmm17,zmm6,zmm20,0x01 + vpclmulqdq zmm19,zmm6,zmm20,0x10 + vmovdqu64 ymm20,YMMWORD[304+rdx] + vinserti64x2 zmm20,zmm20,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm7,zmm20,0x01 + vpclmulqdq zmm5,zmm7,zmm20,0x10 + vpclmulqdq zmm0,zmm7,zmm20,0x11 + vpclmulqdq zmm3,zmm7,zmm20,0x00 + + vpxorq zmm4,zmm4,zmm17 + vpxorq zmm5,zmm5,zmm19 + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm16 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_973 +$L$_small_initial_partial_block_973: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm12 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[256+rdx] + vpclmulqdq zmm15,zmm6,zmm20,0x11 + vpclmulqdq zmm16,zmm6,zmm20,0x00 + vpclmulqdq zmm17,zmm6,zmm20,0x01 + vpclmulqdq zmm19,zmm6,zmm20,0x10 + vmovdqu64 ymm20,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm7,ymm20,0x01 + vpclmulqdq ymm5,ymm7,ymm20,0x10 + vpclmulqdq ymm0,ymm7,ymm20,0x11 + vpclmulqdq ymm3,ymm7,ymm20,0x00 + + vpxorq zmm4,zmm4,zmm17 + vpxorq zmm5,zmm5,zmm19 + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm16 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_973: + + or r13,r13 + je NEAR $L$_after_reduction_973 + vpxorq xmm14,xmm14,xmm13 +$L$_after_reduction_973: + jmp NEAR 
$L$_small_initial_blocks_encrypted_966 +$L$_small_initial_num_blocks_is_8_966: + vmovdqa64 zmm29,ZMMWORD[SHUF_MASK] + vshufi64x2 zmm2,zmm2,zmm2,0 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpaddd zmm3,zmm2,ZMMWORD[ddq_add_5678] + lea r10,[byte64_len_to_mask_table] + mov r15,r13 + sub r15,64 + kmovq k1,[r15*8+r10] + vextracti32x4 xmm2,zmm3,3 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vmovdqu8 zmm6,ZMMWORD[r11*1+r9] + vmovdqu8 zmm7{k1}{z},[64+r11*1+r9] + vbroadcastf64x2 zmm15,ZMMWORD[rcx] + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[16+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[32+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[48+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[64+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[80+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[96+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[112+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[128+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[224+rcx] + vaesenclast zmm0,zmm0,zmm15 + vaesenclast zmm3,zmm3,zmm15 + vpxorq zmm0,zmm0,zmm6 + vpxorq zmm3,zmm3,zmm7 + vextracti32x4 xmm12,zmm3,3 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10]{k1},zmm3 + vmovdqu8 zmm3{k1}{z},zmm3 + vpshufb zmm6,zmm6,zmm29 + vpshufb zmm7,zmm7,zmm29 + vextracti32x4 xmm13,zmm7,3 + sub r13,16 * (8 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_974 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[224+rdx] + vpclmulqdq zmm0,zmm6,zmm20,0x11 + vpclmulqdq zmm3,zmm6,zmm20,0x00 + vpclmulqdq zmm4,zmm6,zmm20,0x01 + vpclmulqdq zmm5,zmm6,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[288+rdx] + vpclmulqdq zmm15,zmm7,zmm20,0x11 + vpclmulqdq zmm16,zmm7,zmm20,0x00 + vpclmulqdq zmm17,zmm7,zmm20,0x01 + vpclmulqdq zmm19,zmm7,zmm20,0x10 + vpxorq zmm15,zmm0,zmm15 + vpxorq zmm16,zmm3,zmm16 + vpxorq zmm17,zmm4,zmm17 + vpxorq zmm19,zmm5,zmm19 + + vpxorq zmm17,zmm17,zmm19 + vpsrldq zmm4,zmm17,8 + vpslldq zmm5,zmm17,8 + vpxorq zmm0,zmm15,zmm4 + vpxorq zmm3,zmm16,zmm5 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_974 +$L$_small_initial_partial_block_974: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm12 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 
zmm20,ZMMWORD[240+rdx] + vpclmulqdq zmm15,zmm6,zmm20,0x11 + vpclmulqdq zmm16,zmm6,zmm20,0x00 + vpclmulqdq zmm17,zmm6,zmm20,0x01 + vpclmulqdq zmm19,zmm6,zmm20,0x10 + vmovdqu64 ymm20,YMMWORD[304+rdx] + vinserti64x2 zmm20,zmm20,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm7,zmm20,0x01 + vpclmulqdq zmm5,zmm7,zmm20,0x10 + vpclmulqdq zmm0,zmm7,zmm20,0x11 + vpclmulqdq zmm3,zmm7,zmm20,0x00 + + vpxorq zmm4,zmm4,zmm17 + vpxorq zmm5,zmm5,zmm19 + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm16 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_974: + + or r13,r13 + je NEAR $L$_after_reduction_974 + vpxorq xmm14,xmm14,xmm13 +$L$_after_reduction_974: + jmp NEAR $L$_small_initial_blocks_encrypted_966 +$L$_small_initial_num_blocks_is_9_966: + vmovdqa64 zmm29,ZMMWORD[SHUF_MASK] + vshufi64x2 zmm2,zmm2,zmm2,0 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpaddd zmm3,zmm2,ZMMWORD[ddq_add_5678] + vpaddd zmm4,zmm0,ZMMWORD[ddq_add_8888] + lea r10,[byte64_len_to_mask_table] + mov r15,r13 + sub r15,128 + kmovq k1,[r15*8+r10] + vextracti32x4 xmm2,zmm4,0 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb xmm4,xmm4,xmm29 + vmovdqu8 zmm6,ZMMWORD[r11*1+r9] + vmovdqu8 zmm7,ZMMWORD[64+r11*1+r9] + vmovdqu8 xmm10{k1}{z},[128+r11*1+r9] + vbroadcastf64x2 zmm15,ZMMWORD[rcx] + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm15 + vpxorq xmm4,xmm4,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[16+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc xmm4,xmm4,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[32+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc xmm4,xmm4,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[48+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc xmm4,xmm4,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[64+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc xmm4,xmm4,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[80+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc xmm4,xmm4,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[96+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc xmm4,xmm4,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[112+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc xmm4,xmm4,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[128+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc xmm4,xmm4,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc xmm4,xmm4,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc xmm4,xmm4,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc xmm4,xmm4,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc xmm4,xmm4,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc xmm4,xmm4,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[224+rcx] + vaesenclast zmm0,zmm0,zmm15 + vaesenclast 
zmm3,zmm3,zmm15 + vaesenclast xmm4,xmm4,xmm15 + vpxorq zmm0,zmm0,zmm6 + vpxorq zmm3,zmm3,zmm7 + vpxorq xmm4,xmm4,xmm10 + vextracti32x4 xmm12,zmm4,0 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 XMMWORD[128+r11*1+r10]{k1},xmm4 + vmovdqu8 zmm4{k1}{z},zmm4 + vpshufb zmm6,zmm6,zmm29 + vpshufb zmm7,zmm7,zmm29 + vpshufb xmm10,xmm10,xmm29 + vextracti32x4 xmm13,zmm10,0 + sub r13,16 * (9 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_975 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[208+rdx] + vpclmulqdq zmm0,zmm6,zmm20,0x11 + vpclmulqdq zmm3,zmm6,zmm20,0x00 + vpclmulqdq zmm4,zmm6,zmm20,0x01 + vpclmulqdq zmm5,zmm6,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[272+rdx] + vpclmulqdq zmm15,zmm7,zmm20,0x11 + vpclmulqdq zmm16,zmm7,zmm20,0x00 + vpclmulqdq zmm17,zmm7,zmm20,0x01 + vpclmulqdq zmm19,zmm7,zmm20,0x10 + vpxorq zmm15,zmm0,zmm15 + vpxorq zmm16,zmm3,zmm16 + vpxorq zmm17,zmm4,zmm17 + vpxorq zmm19,zmm5,zmm19 + vmovdqu64 xmm20,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm10,xmm20,0x01 + vpclmulqdq xmm5,xmm10,xmm20,0x10 + vpclmulqdq xmm0,xmm10,xmm20,0x11 + vpclmulqdq xmm3,xmm10,xmm20,0x00 + + vpxorq zmm4,zmm4,zmm17 + vpxorq zmm5,zmm5,zmm19 + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm16 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_975 +$L$_small_initial_partial_block_975: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm12 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[224+rdx] + vpclmulqdq zmm0,zmm6,zmm20,0x11 + vpclmulqdq zmm3,zmm6,zmm20,0x00 + vpclmulqdq zmm4,zmm6,zmm20,0x01 + vpclmulqdq zmm5,zmm6,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[288+rdx] + vpclmulqdq zmm15,zmm7,zmm20,0x11 + vpclmulqdq zmm16,zmm7,zmm20,0x00 + vpclmulqdq zmm17,zmm7,zmm20,0x01 + vpclmulqdq zmm19,zmm7,zmm20,0x10 + vpxorq zmm15,zmm0,zmm15 + vpxorq zmm16,zmm3,zmm16 + vpxorq zmm17,zmm4,zmm17 + vpxorq zmm19,zmm5,zmm19 + + vpxorq zmm17,zmm17,zmm19 + vpsrldq zmm4,zmm17,8 + vpslldq zmm5,zmm17,8 + vpxorq zmm0,zmm15,zmm4 + vpxorq zmm3,zmm16,zmm5 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_975: + + or r13,r13 + je NEAR $L$_after_reduction_975 + vpxorq xmm14,xmm14,xmm13 +$L$_after_reduction_975: + jmp NEAR $L$_small_initial_blocks_encrypted_966 +$L$_small_initial_num_blocks_is_10_966: + vmovdqa64 zmm29,ZMMWORD[SHUF_MASK] + vshufi64x2 zmm2,zmm2,zmm2,0 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpaddd zmm3,zmm2,ZMMWORD[ddq_add_5678] + vpaddd 
zmm4,zmm0,ZMMWORD[ddq_add_8888] + lea r10,[byte64_len_to_mask_table] + mov r15,r13 + sub r15,128 + kmovq k1,[r15*8+r10] + vextracti32x4 xmm2,zmm4,1 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb ymm4,ymm4,ymm29 + vmovdqu8 zmm6,ZMMWORD[r11*1+r9] + vmovdqu8 zmm7,ZMMWORD[64+r11*1+r9] + vmovdqu8 ymm10{k1}{z},[128+r11*1+r9] + vbroadcastf64x2 zmm15,ZMMWORD[rcx] + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm15 + vpxorq ymm4,ymm4,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[16+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc ymm4,ymm4,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[32+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc ymm4,ymm4,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[48+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc ymm4,ymm4,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[64+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc ymm4,ymm4,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[80+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc ymm4,ymm4,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[96+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc ymm4,ymm4,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[112+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc ymm4,ymm4,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[128+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc ymm4,ymm4,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc ymm4,ymm4,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc ymm4,ymm4,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc ymm4,ymm4,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc ymm4,ymm4,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc ymm4,ymm4,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[224+rcx] + vaesenclast zmm0,zmm0,zmm15 + vaesenclast zmm3,zmm3,zmm15 + vaesenclast ymm4,ymm4,ymm15 + vpxorq zmm0,zmm0,zmm6 + vpxorq zmm3,zmm3,zmm7 + vpxorq ymm4,ymm4,ymm10 + vextracti32x4 xmm12,zmm4,1 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 YMMWORD[128+r11*1+r10]{k1},ymm4 + vmovdqu8 zmm4{k1}{z},zmm4 + vpshufb zmm6,zmm6,zmm29 + vpshufb zmm7,zmm7,zmm29 + vpshufb ymm10,ymm10,ymm29 + vextracti32x4 xmm13,zmm10,1 + sub r13,16 * (10 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_976 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[192+rdx] + vpclmulqdq zmm0,zmm6,zmm20,0x11 + vpclmulqdq zmm3,zmm6,zmm20,0x00 + vpclmulqdq zmm4,zmm6,zmm20,0x01 + vpclmulqdq zmm5,zmm6,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[256+rdx] + vpclmulqdq zmm15,zmm7,zmm20,0x11 + vpclmulqdq zmm16,zmm7,zmm20,0x00 + vpclmulqdq zmm17,zmm7,zmm20,0x01 + vpclmulqdq zmm19,zmm7,zmm20,0x10 + vpxorq zmm15,zmm0,zmm15 + vpxorq zmm16,zmm3,zmm16 + vpxorq zmm17,zmm4,zmm17 + vpxorq zmm19,zmm5,zmm19 + vmovdqu64 ymm20,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm10,ymm20,0x01 + vpclmulqdq ymm5,ymm10,ymm20,0x10 + vpclmulqdq ymm0,ymm10,ymm20,0x11 + vpclmulqdq ymm3,ymm10,ymm20,0x00 + + vpxorq zmm4,zmm4,zmm17 + vpxorq zmm5,zmm5,zmm19 + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm16 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq 
ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_976 +$L$_small_initial_partial_block_976: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm12 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[208+rdx] + vpclmulqdq zmm0,zmm6,zmm20,0x11 + vpclmulqdq zmm3,zmm6,zmm20,0x00 + vpclmulqdq zmm4,zmm6,zmm20,0x01 + vpclmulqdq zmm5,zmm6,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[272+rdx] + vpclmulqdq zmm15,zmm7,zmm20,0x11 + vpclmulqdq zmm16,zmm7,zmm20,0x00 + vpclmulqdq zmm17,zmm7,zmm20,0x01 + vpclmulqdq zmm19,zmm7,zmm20,0x10 + vpxorq zmm15,zmm0,zmm15 + vpxorq zmm16,zmm3,zmm16 + vpxorq zmm17,zmm4,zmm17 + vpxorq zmm19,zmm5,zmm19 + vmovdqu64 xmm20,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm10,xmm20,0x01 + vpclmulqdq xmm5,xmm10,xmm20,0x10 + vpclmulqdq xmm0,xmm10,xmm20,0x11 + vpclmulqdq xmm3,xmm10,xmm20,0x00 + + vpxorq zmm4,zmm4,zmm17 + vpxorq zmm5,zmm5,zmm19 + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm16 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_976: + + or r13,r13 + je NEAR $L$_after_reduction_976 + vpxorq xmm14,xmm14,xmm13 +$L$_after_reduction_976: + jmp NEAR $L$_small_initial_blocks_encrypted_966 +$L$_small_initial_num_blocks_is_11_966: + vmovdqa64 zmm29,ZMMWORD[SHUF_MASK] + vshufi64x2 zmm2,zmm2,zmm2,0 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpaddd zmm3,zmm2,ZMMWORD[ddq_add_5678] + vpaddd zmm4,zmm0,ZMMWORD[ddq_add_8888] + lea r10,[byte64_len_to_mask_table] + mov r15,r13 + sub r15,128 + kmovq k1,[r15*8+r10] + vextracti32x4 xmm2,zmm4,2 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vmovdqu8 zmm6,ZMMWORD[r11*1+r9] + vmovdqu8 zmm7,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm10{k1}{z},[128+r11*1+r9] + vbroadcastf64x2 zmm15,ZMMWORD[rcx] + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm15 + vpxorq zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[16+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[32+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[48+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[64+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[80+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[96+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[112+rcx] + 
vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[128+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[224+rcx] + vaesenclast zmm0,zmm0,zmm15 + vaesenclast zmm3,zmm3,zmm15 + vaesenclast zmm4,zmm4,zmm15 + vpxorq zmm0,zmm0,zmm6 + vpxorq zmm3,zmm3,zmm7 + vpxorq zmm4,zmm4,zmm10 + vextracti32x4 xmm12,zmm4,2 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10]{k1},zmm4 + vmovdqu8 zmm4{k1}{z},zmm4 + vpshufb zmm6,zmm6,zmm29 + vpshufb zmm7,zmm7,zmm29 + vpshufb zmm10,zmm10,zmm29 + vextracti32x4 xmm13,zmm10,2 + sub r13,16 * (11 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_977 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[176+rdx] + vpclmulqdq zmm0,zmm6,zmm20,0x11 + vpclmulqdq zmm3,zmm6,zmm20,0x00 + vpclmulqdq zmm4,zmm6,zmm20,0x01 + vpclmulqdq zmm5,zmm6,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[240+rdx] + vpclmulqdq zmm15,zmm7,zmm20,0x11 + vpclmulqdq zmm16,zmm7,zmm20,0x00 + vpclmulqdq zmm17,zmm7,zmm20,0x01 + vpclmulqdq zmm19,zmm7,zmm20,0x10 + vpxorq zmm15,zmm0,zmm15 + vpxorq zmm16,zmm3,zmm16 + vpxorq zmm17,zmm4,zmm17 + vpxorq zmm19,zmm5,zmm19 + vmovdqu64 ymm20,YMMWORD[304+rdx] + vinserti64x2 zmm20,zmm20,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm10,zmm20,0x01 + vpclmulqdq zmm5,zmm10,zmm20,0x10 + vpclmulqdq zmm0,zmm10,zmm20,0x11 + vpclmulqdq zmm3,zmm10,zmm20,0x00 + + vpxorq zmm4,zmm4,zmm17 + vpxorq zmm5,zmm5,zmm19 + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm16 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_977 +$L$_small_initial_partial_block_977: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm12 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[192+rdx] + vpclmulqdq zmm0,zmm6,zmm20,0x11 + vpclmulqdq zmm3,zmm6,zmm20,0x00 + vpclmulqdq zmm4,zmm6,zmm20,0x01 + vpclmulqdq zmm5,zmm6,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[256+rdx] + vpclmulqdq zmm15,zmm7,zmm20,0x11 + vpclmulqdq zmm16,zmm7,zmm20,0x00 + vpclmulqdq zmm17,zmm7,zmm20,0x01 + vpclmulqdq zmm19,zmm7,zmm20,0x10 + vpxorq zmm15,zmm0,zmm15 + vpxorq zmm16,zmm3,zmm16 + vpxorq zmm17,zmm4,zmm17 + vpxorq zmm19,zmm5,zmm19 + vmovdqu64 ymm20,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm10,ymm20,0x01 + vpclmulqdq 
ymm5,ymm10,ymm20,0x10 + vpclmulqdq ymm0,ymm10,ymm20,0x11 + vpclmulqdq ymm3,ymm10,ymm20,0x00 + + vpxorq zmm4,zmm4,zmm17 + vpxorq zmm5,zmm5,zmm19 + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm16 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_977: + + or r13,r13 + je NEAR $L$_after_reduction_977 + vpxorq xmm14,xmm14,xmm13 +$L$_after_reduction_977: + jmp NEAR $L$_small_initial_blocks_encrypted_966 +$L$_small_initial_num_blocks_is_12_966: + vmovdqa64 zmm29,ZMMWORD[SHUF_MASK] + vshufi64x2 zmm2,zmm2,zmm2,0 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpaddd zmm3,zmm2,ZMMWORD[ddq_add_5678] + vpaddd zmm4,zmm0,ZMMWORD[ddq_add_8888] + lea r10,[byte64_len_to_mask_table] + mov r15,r13 + sub r15,128 + kmovq k1,[r15*8+r10] + vextracti32x4 xmm2,zmm4,3 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vmovdqu8 zmm6,ZMMWORD[r11*1+r9] + vmovdqu8 zmm7,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm10{k1}{z},[128+r11*1+r9] + vbroadcastf64x2 zmm15,ZMMWORD[rcx] + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm15 + vpxorq zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[16+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[32+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[48+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[64+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[80+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[96+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[112+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[128+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[224+rcx] + vaesenclast zmm0,zmm0,zmm15 + vaesenclast zmm3,zmm3,zmm15 + vaesenclast zmm4,zmm4,zmm15 + vpxorq zmm0,zmm0,zmm6 + vpxorq zmm3,zmm3,zmm7 + vpxorq zmm4,zmm4,zmm10 + vextracti32x4 xmm12,zmm4,3 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10]{k1},zmm4 
+ vmovdqu8 zmm4{k1}{z},zmm4 + vpshufb zmm6,zmm6,zmm29 + vpshufb zmm7,zmm7,zmm29 + vpshufb zmm10,zmm10,zmm29 + vextracti32x4 xmm13,zmm10,3 + sub r13,16 * (12 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_978 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[160+rdx] + vpclmulqdq zmm0,zmm6,zmm20,0x11 + vpclmulqdq zmm3,zmm6,zmm20,0x00 + vpclmulqdq zmm4,zmm6,zmm20,0x01 + vpclmulqdq zmm5,zmm6,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[224+rdx] + vpclmulqdq zmm15,zmm7,zmm20,0x11 + vpclmulqdq zmm16,zmm7,zmm20,0x00 + vpclmulqdq zmm17,zmm7,zmm20,0x01 + vpclmulqdq zmm19,zmm7,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[288+rdx] + vpclmulqdq zmm6,zmm10,zmm20,0x11 + vpclmulqdq zmm7,zmm10,zmm20,0x00 + vpternlogq zmm15,zmm6,zmm0,0x96 + vpternlogq zmm16,zmm7,zmm3,0x96 + vpclmulqdq zmm6,zmm10,zmm20,0x01 + vpclmulqdq zmm7,zmm10,zmm20,0x10 + vpternlogq zmm17,zmm6,zmm4,0x96 + vpternlogq zmm19,zmm7,zmm5,0x96 + + vpxorq zmm17,zmm17,zmm19 + vpsrldq zmm4,zmm17,8 + vpslldq zmm5,zmm17,8 + vpxorq zmm0,zmm15,zmm4 + vpxorq zmm3,zmm16,zmm5 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_978 +$L$_small_initial_partial_block_978: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm12 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[176+rdx] + vpclmulqdq zmm0,zmm6,zmm20,0x11 + vpclmulqdq zmm3,zmm6,zmm20,0x00 + vpclmulqdq zmm4,zmm6,zmm20,0x01 + vpclmulqdq zmm5,zmm6,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[240+rdx] + vpclmulqdq zmm15,zmm7,zmm20,0x11 + vpclmulqdq zmm16,zmm7,zmm20,0x00 + vpclmulqdq zmm17,zmm7,zmm20,0x01 + vpclmulqdq zmm19,zmm7,zmm20,0x10 + vpxorq zmm15,zmm0,zmm15 + vpxorq zmm16,zmm3,zmm16 + vpxorq zmm17,zmm4,zmm17 + vpxorq zmm19,zmm5,zmm19 + vmovdqu64 ymm20,YMMWORD[304+rdx] + vinserti64x2 zmm20,zmm20,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm10,zmm20,0x01 + vpclmulqdq zmm5,zmm10,zmm20,0x10 + vpclmulqdq zmm0,zmm10,zmm20,0x11 + vpclmulqdq zmm3,zmm10,zmm20,0x00 + + vpxorq zmm4,zmm4,zmm17 + vpxorq zmm5,zmm5,zmm19 + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm16 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_978: + + or r13,r13 + je NEAR $L$_after_reduction_978 + vpxorq xmm14,xmm14,xmm13 +$L$_after_reduction_978: + jmp NEAR $L$_small_initial_blocks_encrypted_966 +$L$_small_initial_num_blocks_is_13_966: + vmovdqa64 zmm29,ZMMWORD[SHUF_MASK] + vshufi64x2 zmm2,zmm2,zmm2,0 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpaddd zmm3,zmm2,ZMMWORD[ddq_add_5678] + vpaddd zmm4,zmm0,ZMMWORD[ddq_add_8888] + vpaddd 
zmm5,zmm3,ZMMWORD[ddq_add_8888] + lea r10,[byte64_len_to_mask_table] + mov r15,r13 + sub r15,192 + kmovq k1,[r15*8+r10] + vextracti32x4 xmm2,zmm5,0 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb xmm5,xmm5,xmm29 + vmovdqu8 zmm6,ZMMWORD[r11*1+r9] + vmovdqu8 zmm7,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm10,ZMMWORD[128+r11*1+r9] + vmovdqu8 xmm11{k1}{z},[192+r11*1+r9] + vbroadcastf64x2 zmm15,ZMMWORD[rcx] + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm15 + vpxorq zmm4,zmm4,zmm15 + vpxorq xmm5,xmm5,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[16+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc xmm5,xmm5,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[32+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc xmm5,xmm5,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[48+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc xmm5,xmm5,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[64+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc xmm5,xmm5,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[80+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc xmm5,xmm5,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[96+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc xmm5,xmm5,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[112+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc xmm5,xmm5,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[128+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc xmm5,xmm5,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc xmm5,xmm5,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc xmm5,xmm5,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc xmm5,xmm5,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc xmm5,xmm5,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc xmm5,xmm5,xmm15 + vbroadcastf64x2 zmm15,ZMMWORD[224+rcx] + vaesenclast zmm0,zmm0,zmm15 + vaesenclast zmm3,zmm3,zmm15 + vaesenclast zmm4,zmm4,zmm15 + vaesenclast xmm5,xmm5,xmm15 + vpxorq zmm0,zmm0,zmm6 + vpxorq zmm3,zmm3,zmm7 + vpxorq zmm4,zmm4,zmm10 + vpxorq xmm5,xmm5,xmm11 + vextracti32x4 xmm12,zmm5,0 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm4 + vmovdqu8 XMMWORD[192+r11*1+r10]{k1},xmm5 + vmovdqu8 zmm5{k1}{z},zmm5 + vpshufb zmm6,zmm6,zmm29 + vpshufb zmm7,zmm7,zmm29 + vpshufb zmm10,zmm10,zmm29 + vpshufb xmm11,xmm11,xmm29 + vextracti32x4 xmm13,zmm11,0 + sub r13,16 * (13 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_979 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[144+rdx] + vpclmulqdq zmm0,zmm6,zmm20,0x11 + vpclmulqdq zmm3,zmm6,zmm20,0x00 + vpclmulqdq zmm4,zmm6,zmm20,0x01 + vpclmulqdq zmm5,zmm6,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[208+rdx] + vpclmulqdq zmm15,zmm7,zmm20,0x11 + vpclmulqdq zmm16,zmm7,zmm20,0x00 + vpclmulqdq zmm17,zmm7,zmm20,0x01 + vpclmulqdq zmm19,zmm7,zmm20,0x10 + 
vmovdqu64 zmm20,ZMMWORD[272+rdx] + vpclmulqdq zmm6,zmm10,zmm20,0x11 + vpclmulqdq zmm7,zmm10,zmm20,0x00 + vpternlogq zmm15,zmm6,zmm0,0x96 + vpternlogq zmm16,zmm7,zmm3,0x96 + vpclmulqdq zmm6,zmm10,zmm20,0x01 + vpclmulqdq zmm7,zmm10,zmm20,0x10 + vpternlogq zmm17,zmm6,zmm4,0x96 + vpternlogq zmm19,zmm7,zmm5,0x96 + vmovdqu64 xmm20,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm11,xmm20,0x01 + vpclmulqdq xmm5,xmm11,xmm20,0x10 + vpclmulqdq xmm0,xmm11,xmm20,0x11 + vpclmulqdq xmm3,xmm11,xmm20,0x00 + + vpxorq zmm4,zmm4,zmm17 + vpxorq zmm5,zmm5,zmm19 + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm16 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_979 +$L$_small_initial_partial_block_979: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm12 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[160+rdx] + vpclmulqdq zmm0,zmm6,zmm20,0x11 + vpclmulqdq zmm3,zmm6,zmm20,0x00 + vpclmulqdq zmm4,zmm6,zmm20,0x01 + vpclmulqdq zmm5,zmm6,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[224+rdx] + vpclmulqdq zmm15,zmm7,zmm20,0x11 + vpclmulqdq zmm16,zmm7,zmm20,0x00 + vpclmulqdq zmm17,zmm7,zmm20,0x01 + vpclmulqdq zmm19,zmm7,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[288+rdx] + vpclmulqdq zmm6,zmm10,zmm20,0x11 + vpclmulqdq zmm7,zmm10,zmm20,0x00 + vpternlogq zmm15,zmm6,zmm0,0x96 + vpternlogq zmm16,zmm7,zmm3,0x96 + vpclmulqdq zmm6,zmm10,zmm20,0x01 + vpclmulqdq zmm7,zmm10,zmm20,0x10 + vpternlogq zmm17,zmm6,zmm4,0x96 + vpternlogq zmm19,zmm7,zmm5,0x96 + + vpxorq zmm17,zmm17,zmm19 + vpsrldq zmm4,zmm17,8 + vpslldq zmm5,zmm17,8 + vpxorq zmm0,zmm15,zmm4 + vpxorq zmm3,zmm16,zmm5 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_979: + + or r13,r13 + je NEAR $L$_after_reduction_979 + vpxorq xmm14,xmm14,xmm13 +$L$_after_reduction_979: + jmp NEAR $L$_small_initial_blocks_encrypted_966 +$L$_small_initial_num_blocks_is_14_966: + vmovdqa64 zmm29,ZMMWORD[SHUF_MASK] + vshufi64x2 zmm2,zmm2,zmm2,0 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpaddd zmm3,zmm2,ZMMWORD[ddq_add_5678] + vpaddd zmm4,zmm0,ZMMWORD[ddq_add_8888] + vpaddd zmm5,zmm3,ZMMWORD[ddq_add_8888] + lea r10,[byte64_len_to_mask_table] + mov r15,r13 + sub r15,192 + kmovq k1,[r15*8+r10] + vextracti32x4 xmm2,zmm5,1 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb ymm5,ymm5,ymm29 + vmovdqu8 zmm6,ZMMWORD[r11*1+r9] + vmovdqu8 zmm7,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm10,ZMMWORD[128+r11*1+r9] + vmovdqu8 ymm11{k1}{z},[192+r11*1+r9] + vbroadcastf64x2 zmm15,ZMMWORD[rcx] + vpxorq zmm0,zmm0,zmm15 + vpxorq 
zmm3,zmm3,zmm15 + vpxorq zmm4,zmm4,zmm15 + vpxorq ymm5,ymm5,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[16+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc ymm5,ymm5,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[32+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc ymm5,ymm5,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[48+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc ymm5,ymm5,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[64+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc ymm5,ymm5,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[80+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc ymm5,ymm5,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[96+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc ymm5,ymm5,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[112+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc ymm5,ymm5,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[128+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc ymm5,ymm5,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc ymm5,ymm5,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc ymm5,ymm5,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc ymm5,ymm5,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc ymm5,ymm5,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc ymm5,ymm5,ymm15 + vbroadcastf64x2 zmm15,ZMMWORD[224+rcx] + vaesenclast zmm0,zmm0,zmm15 + vaesenclast zmm3,zmm3,zmm15 + vaesenclast zmm4,zmm4,zmm15 + vaesenclast ymm5,ymm5,ymm15 + vpxorq zmm0,zmm0,zmm6 + vpxorq zmm3,zmm3,zmm7 + vpxorq zmm4,zmm4,zmm10 + vpxorq ymm5,ymm5,ymm11 + vextracti32x4 xmm12,zmm5,1 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm4 + vmovdqu8 YMMWORD[192+r11*1+r10]{k1},ymm5 + vmovdqu8 zmm5{k1}{z},zmm5 + vpshufb zmm6,zmm6,zmm29 + vpshufb zmm7,zmm7,zmm29 + vpshufb zmm10,zmm10,zmm29 + vpshufb ymm11,ymm11,ymm29 + vextracti32x4 xmm13,zmm11,1 + sub r13,16 * (14 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_980 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[128+rdx] + vpclmulqdq zmm0,zmm6,zmm20,0x11 + vpclmulqdq zmm3,zmm6,zmm20,0x00 + vpclmulqdq zmm4,zmm6,zmm20,0x01 + vpclmulqdq zmm5,zmm6,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[192+rdx] + vpclmulqdq zmm15,zmm7,zmm20,0x11 + vpclmulqdq zmm16,zmm7,zmm20,0x00 + vpclmulqdq zmm17,zmm7,zmm20,0x01 + vpclmulqdq zmm19,zmm7,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[256+rdx] + vpclmulqdq zmm6,zmm10,zmm20,0x11 + vpclmulqdq zmm7,zmm10,zmm20,0x00 + vpternlogq zmm15,zmm6,zmm0,0x96 + vpternlogq zmm16,zmm7,zmm3,0x96 + vpclmulqdq zmm6,zmm10,zmm20,0x01 + vpclmulqdq zmm7,zmm10,zmm20,0x10 + vpternlogq zmm17,zmm6,zmm4,0x96 + vpternlogq zmm19,zmm7,zmm5,0x96 + vmovdqu64 ymm20,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm11,ymm20,0x01 + vpclmulqdq ymm5,ymm11,ymm20,0x10 + vpclmulqdq ymm0,ymm11,ymm20,0x11 + vpclmulqdq 
ymm3,ymm11,ymm20,0x00 + + vpxorq zmm4,zmm4,zmm17 + vpxorq zmm5,zmm5,zmm19 + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm16 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_980 +$L$_small_initial_partial_block_980: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm12 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[144+rdx] + vpclmulqdq zmm0,zmm6,zmm20,0x11 + vpclmulqdq zmm3,zmm6,zmm20,0x00 + vpclmulqdq zmm4,zmm6,zmm20,0x01 + vpclmulqdq zmm5,zmm6,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[208+rdx] + vpclmulqdq zmm15,zmm7,zmm20,0x11 + vpclmulqdq zmm16,zmm7,zmm20,0x00 + vpclmulqdq zmm17,zmm7,zmm20,0x01 + vpclmulqdq zmm19,zmm7,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[272+rdx] + vpclmulqdq zmm6,zmm10,zmm20,0x11 + vpclmulqdq zmm7,zmm10,zmm20,0x00 + vpternlogq zmm15,zmm6,zmm0,0x96 + vpternlogq zmm16,zmm7,zmm3,0x96 + vpclmulqdq zmm6,zmm10,zmm20,0x01 + vpclmulqdq zmm7,zmm10,zmm20,0x10 + vpternlogq zmm17,zmm6,zmm4,0x96 + vpternlogq zmm19,zmm7,zmm5,0x96 + vmovdqu64 xmm20,XMMWORD[336+rdx] + vpclmulqdq xmm4,xmm11,xmm20,0x01 + vpclmulqdq xmm5,xmm11,xmm20,0x10 + vpclmulqdq xmm0,xmm11,xmm20,0x11 + vpclmulqdq xmm3,xmm11,xmm20,0x00 + + vpxorq zmm4,zmm4,zmm17 + vpxorq zmm5,zmm5,zmm19 + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm16 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_980: + + or r13,r13 + je NEAR $L$_after_reduction_980 + vpxorq xmm14,xmm14,xmm13 +$L$_after_reduction_980: + jmp NEAR $L$_small_initial_blocks_encrypted_966 +$L$_small_initial_num_blocks_is_15_966: + vmovdqa64 zmm29,ZMMWORD[SHUF_MASK] + vshufi64x2 zmm2,zmm2,zmm2,0 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpaddd zmm3,zmm2,ZMMWORD[ddq_add_5678] + vpaddd zmm4,zmm0,ZMMWORD[ddq_add_8888] + vpaddd zmm5,zmm3,ZMMWORD[ddq_add_8888] + lea r10,[byte64_len_to_mask_table] + mov r15,r13 + sub r15,192 + kmovq k1,[r15*8+r10] + vextracti32x4 xmm2,zmm5,2 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb zmm5,zmm5,zmm29 + vmovdqu8 zmm6,ZMMWORD[r11*1+r9] + vmovdqu8 zmm7,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm10,ZMMWORD[128+r11*1+r9] + vmovdqu8 zmm11{k1}{z},[192+r11*1+r9] + vbroadcastf64x2 zmm15,ZMMWORD[rcx] + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm15 + vpxorq zmm4,zmm4,zmm15 + vpxorq zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[16+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc 
zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[32+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[48+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[64+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[80+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[96+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[112+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[128+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[224+rcx] + vaesenclast zmm0,zmm0,zmm15 + vaesenclast zmm3,zmm3,zmm15 + vaesenclast zmm4,zmm4,zmm15 + vaesenclast zmm5,zmm5,zmm15 + vpxorq zmm0,zmm0,zmm6 + vpxorq zmm3,zmm3,zmm7 + vpxorq zmm4,zmm4,zmm10 + vpxorq zmm5,zmm5,zmm11 + vextracti32x4 xmm12,zmm5,2 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm4 + vmovdqu8 ZMMWORD[192+r11*1+r10]{k1},zmm5 + vmovdqu8 zmm5{k1}{z},zmm5 + vpshufb zmm6,zmm6,zmm29 + vpshufb zmm7,zmm7,zmm29 + vpshufb zmm10,zmm10,zmm29 + vpshufb zmm11,zmm11,zmm29 + vextracti32x4 xmm13,zmm11,2 + sub r13,16 * (15 - 1) + + + cmp r13,16 + jl NEAR $L$_small_initial_partial_block_981 + + + + + + sub r13,16 + mov QWORD[r8],0 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[112+rdx] + vpclmulqdq zmm0,zmm6,zmm20,0x11 + vpclmulqdq zmm3,zmm6,zmm20,0x00 + vpclmulqdq zmm4,zmm6,zmm20,0x01 + vpclmulqdq zmm5,zmm6,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[176+rdx] + vpclmulqdq zmm15,zmm7,zmm20,0x11 + vpclmulqdq zmm16,zmm7,zmm20,0x00 + vpclmulqdq zmm17,zmm7,zmm20,0x01 + vpclmulqdq zmm19,zmm7,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[240+rdx] + vpclmulqdq zmm6,zmm10,zmm20,0x11 + vpclmulqdq zmm7,zmm10,zmm20,0x00 + vpternlogq zmm15,zmm6,zmm0,0x96 + vpternlogq zmm16,zmm7,zmm3,0x96 + vpclmulqdq zmm6,zmm10,zmm20,0x01 + vpclmulqdq zmm7,zmm10,zmm20,0x10 + vpternlogq zmm17,zmm6,zmm4,0x96 + vpternlogq zmm19,zmm7,zmm5,0x96 + vmovdqu64 ymm20,YMMWORD[304+rdx] + vinserti64x2 zmm20,zmm20,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm11,zmm20,0x01 + vpclmulqdq zmm5,zmm11,zmm20,0x10 + vpclmulqdq zmm0,zmm11,zmm20,0x11 + vpclmulqdq zmm3,zmm11,zmm20,0x00 + + vpxorq zmm4,zmm4,zmm17 + vpxorq zmm5,zmm5,zmm19 + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm16 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq 
zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + + jmp NEAR $L$_small_initial_compute_done_981 +$L$_small_initial_partial_block_981: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm12 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[128+rdx] + vpclmulqdq zmm0,zmm6,zmm20,0x11 + vpclmulqdq zmm3,zmm6,zmm20,0x00 + vpclmulqdq zmm4,zmm6,zmm20,0x01 + vpclmulqdq zmm5,zmm6,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[192+rdx] + vpclmulqdq zmm15,zmm7,zmm20,0x11 + vpclmulqdq zmm16,zmm7,zmm20,0x00 + vpclmulqdq zmm17,zmm7,zmm20,0x01 + vpclmulqdq zmm19,zmm7,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[256+rdx] + vpclmulqdq zmm6,zmm10,zmm20,0x11 + vpclmulqdq zmm7,zmm10,zmm20,0x00 + vpternlogq zmm15,zmm6,zmm0,0x96 + vpternlogq zmm16,zmm7,zmm3,0x96 + vpclmulqdq zmm6,zmm10,zmm20,0x01 + vpclmulqdq zmm7,zmm10,zmm20,0x10 + vpternlogq zmm17,zmm6,zmm4,0x96 + vpternlogq zmm19,zmm7,zmm5,0x96 + vmovdqu64 ymm20,YMMWORD[320+rdx] + vpclmulqdq ymm4,ymm11,ymm20,0x01 + vpclmulqdq ymm5,ymm11,ymm20,0x10 + vpclmulqdq ymm0,ymm11,ymm20,0x11 + vpclmulqdq ymm3,ymm11,ymm20,0x00 + + vpxorq zmm4,zmm4,zmm17 + vpxorq zmm5,zmm5,zmm19 + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm16 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_981: + + or r13,r13 + je NEAR $L$_after_reduction_981 + vpxorq xmm14,xmm14,xmm13 +$L$_after_reduction_981: + jmp NEAR $L$_small_initial_blocks_encrypted_966 +$L$_small_initial_num_blocks_is_16_966: + vmovdqa64 zmm29,ZMMWORD[SHUF_MASK] + vshufi64x2 zmm2,zmm2,zmm2,0 + vpaddd zmm0,zmm2,ZMMWORD[ddq_add_1234] + vpaddd zmm3,zmm2,ZMMWORD[ddq_add_5678] + vpaddd zmm4,zmm0,ZMMWORD[ddq_add_8888] + vpaddd zmm5,zmm3,ZMMWORD[ddq_add_8888] + lea r10,[byte64_len_to_mask_table] + mov r15,r13 + sub r15,192 + kmovq k1,[r15*8+r10] + vextracti32x4 xmm2,zmm5,3 + vpshufb zmm0,zmm0,zmm29 + vpshufb zmm3,zmm3,zmm29 + vpshufb zmm4,zmm4,zmm29 + vpshufb zmm5,zmm5,zmm29 + vmovdqu8 zmm6,ZMMWORD[r11*1+r9] + vmovdqu8 zmm7,ZMMWORD[64+r11*1+r9] + vmovdqu8 zmm10,ZMMWORD[128+r11*1+r9] + vmovdqu8 zmm11{k1}{z},[192+r11*1+r9] + vbroadcastf64x2 zmm15,ZMMWORD[rcx] + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm15 + vpxorq zmm4,zmm4,zmm15 + vpxorq zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[16+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[32+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc zmm5,zmm5,zmm15 + 
vbroadcastf64x2 zmm15,ZMMWORD[48+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[64+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[80+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[96+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[112+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[128+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[144+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[160+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[176+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[192+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[208+rcx] + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm3,zmm3,zmm15 + vaesenc zmm4,zmm4,zmm15 + vaesenc zmm5,zmm5,zmm15 + vbroadcastf64x2 zmm15,ZMMWORD[224+rcx] + vaesenclast zmm0,zmm0,zmm15 + vaesenclast zmm3,zmm3,zmm15 + vaesenclast zmm4,zmm4,zmm15 + vaesenclast zmm5,zmm5,zmm15 + vpxorq zmm0,zmm0,zmm6 + vpxorq zmm3,zmm3,zmm7 + vpxorq zmm4,zmm4,zmm10 + vpxorq zmm5,zmm5,zmm11 + vextracti32x4 xmm12,zmm5,3 + mov r10,QWORD[120+rbp] + vmovdqu8 ZMMWORD[r11*1+r10],zmm0 + vmovdqu8 ZMMWORD[64+r11*1+r10],zmm3 + vmovdqu8 ZMMWORD[128+r11*1+r10],zmm4 + vmovdqu8 ZMMWORD[192+r11*1+r10]{k1},zmm5 + vmovdqu8 zmm5{k1}{z},zmm5 + vpshufb zmm6,zmm6,zmm29 + vpshufb zmm7,zmm7,zmm29 + vpshufb zmm10,zmm10,zmm29 + vpshufb zmm11,zmm11,zmm29 + vextracti32x4 xmm13,zmm11,3 + sub r13,16 * (16 - 1) +$L$_small_initial_partial_block_982: + + + + + + + + + mov QWORD[r8],r13 + vmovdqu64 XMMWORD[16+rdx],xmm12 + vpxorq zmm6,zmm6,zmm14 + vmovdqu64 zmm20,ZMMWORD[112+rdx] + vpclmulqdq zmm0,zmm6,zmm20,0x11 + vpclmulqdq zmm3,zmm6,zmm20,0x00 + vpclmulqdq zmm4,zmm6,zmm20,0x01 + vpclmulqdq zmm5,zmm6,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[176+rdx] + vpclmulqdq zmm15,zmm7,zmm20,0x11 + vpclmulqdq zmm16,zmm7,zmm20,0x00 + vpclmulqdq zmm17,zmm7,zmm20,0x01 + vpclmulqdq zmm19,zmm7,zmm20,0x10 + vmovdqu64 zmm20,ZMMWORD[240+rdx] + vpclmulqdq zmm6,zmm10,zmm20,0x11 + vpclmulqdq zmm7,zmm10,zmm20,0x00 + vpternlogq zmm15,zmm6,zmm0,0x96 + vpternlogq zmm16,zmm7,zmm3,0x96 + vpclmulqdq zmm6,zmm10,zmm20,0x01 + vpclmulqdq zmm7,zmm10,zmm20,0x10 + vpternlogq zmm17,zmm6,zmm4,0x96 + vpternlogq zmm19,zmm7,zmm5,0x96 + vmovdqu64 ymm20,YMMWORD[304+rdx] + vinserti64x2 zmm20,zmm20,ZMMWORD[336+rdx],2 + vpclmulqdq zmm4,zmm11,zmm20,0x01 + vpclmulqdq zmm5,zmm11,zmm20,0x10 + vpclmulqdq zmm0,zmm11,zmm20,0x11 + vpclmulqdq zmm3,zmm11,zmm20,0x00 + + vpxorq zmm4,zmm4,zmm17 + vpxorq zmm5,zmm5,zmm19 + vpxorq zmm0,zmm0,zmm15 + vpxorq zmm3,zmm3,zmm16 + + vpxorq zmm4,zmm4,zmm5 + vpsrldq zmm17,zmm4,8 + vpslldq zmm19,zmm4,8 + vpxorq zmm0,zmm0,zmm17 + vpxorq zmm3,zmm3,zmm19 + vextracti64x4 ymm17,zmm0,1 + vpxorq ymm0,ymm0,ymm17 + vextracti32x4 
xmm17,ymm0,1 + vpxorq xmm0,xmm0,xmm17 + vextracti64x4 ymm19,zmm3,1 + vpxorq ymm3,ymm3,ymm19 + vextracti32x4 xmm19,ymm3,1 + vpxorq xmm3,xmm3,xmm19 + vmovdqa64 xmm20,XMMWORD[POLY2] + + + vpclmulqdq xmm4,xmm20,xmm3,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm4,xmm3,xmm4 + + + vpclmulqdq xmm5,xmm20,xmm4,0x00 + vpsrldq xmm5,xmm5,4 + vpclmulqdq xmm14,xmm20,xmm4,0x10 + vpslldq xmm14,xmm14,4 + vpternlogq xmm14,xmm5,xmm0,0x96 + +$L$_small_initial_compute_done_982: + vpxorq xmm14,xmm14,xmm13 +$L$_after_reduction_982: +$L$_small_initial_blocks_encrypted_966: +$L$_ghash_done_821: + vmovdqu64 XMMWORD[rdx],xmm2 + vmovdqu64 XMMWORD[64+rdx],xmm14 +$L$_enc_dec_done_821: + jmp NEAR $L$exit_gcm_decrypt +$L$exit_gcm_decrypt: + cmp QWORD[112+rbp],256 + jbe NEAR $L$skip_hkeys_cleanup_983 + vpxor xmm0,xmm0,xmm0 + vmovdqa64 ZMMWORD[rsp],zmm0 + vmovdqa64 ZMMWORD[64+rsp],zmm0 + vmovdqa64 ZMMWORD[128+rsp],zmm0 + vmovdqa64 ZMMWORD[192+rsp],zmm0 + vmovdqa64 ZMMWORD[256+rsp],zmm0 + vmovdqa64 ZMMWORD[320+rsp],zmm0 + vmovdqa64 ZMMWORD[384+rsp],zmm0 + vmovdqa64 ZMMWORD[448+rsp],zmm0 + vmovdqa64 ZMMWORD[512+rsp],zmm0 + vmovdqa64 ZMMWORD[576+rsp],zmm0 + vmovdqa64 ZMMWORD[640+rsp],zmm0 + vmovdqa64 ZMMWORD[704+rsp],zmm0 +$L$skip_hkeys_cleanup_983: + vzeroupper + vmovdqu xmm15,XMMWORD[((-16))+rbp] + vmovdqu xmm14,XMMWORD[((-32))+rbp] + vmovdqu xmm13,XMMWORD[((-48))+rbp] + vmovdqu xmm12,XMMWORD[((-64))+rbp] + vmovdqu xmm11,XMMWORD[((-80))+rbp] + vmovdqu xmm10,XMMWORD[((-96))+rbp] + vmovdqu xmm9,XMMWORD[((-112))+rbp] + vmovdqu xmm8,XMMWORD[((-128))+rbp] + vmovdqu xmm7,XMMWORD[((-144))+rbp] + vmovdqu xmm6,XMMWORD[((-160))+rbp] + lea rsp,[8+rbp] + pop rsi + + pop rdi + + pop r15 + + pop r14 + + pop r13 + + pop r12 + + pop rbp + + pop rbx + + DB 0F3h,0C3h ;repret +$L$decrypt_seh_end: + + +global ossl_aes_gcm_finalize_avx512 + +ALIGN 32 +ossl_aes_gcm_finalize_avx512: + +DB 243,15,30,250 + vmovdqu xmm2,XMMWORD[336+rcx] + vmovdqu xmm3,XMMWORD[32+rcx] + vmovdqu xmm4,XMMWORD[64+rcx] + + + cmp rdx,0 + je NEAR $L$_partial_done_984 + + vpclmulqdq xmm0,xmm4,xmm2,0x11 + vpclmulqdq xmm16,xmm4,xmm2,0x00 + vpclmulqdq xmm17,xmm4,xmm2,0x01 + vpclmulqdq xmm4,xmm4,xmm2,0x10 + vpxorq xmm4,xmm4,xmm17 + + vpsrldq xmm17,xmm4,8 + vpslldq xmm4,xmm4,8 + vpxorq xmm0,xmm0,xmm17 + vpxorq xmm4,xmm4,xmm16 + + + + vmovdqu64 xmm17,XMMWORD[POLY2] + + vpclmulqdq xmm16,xmm17,xmm4,0x01 + vpslldq xmm16,xmm16,8 + vpxorq xmm4,xmm4,xmm16 + + + + vpclmulqdq xmm16,xmm17,xmm4,0x00 + vpsrldq xmm16,xmm16,4 + vpclmulqdq xmm4,xmm17,xmm4,0x10 + vpslldq xmm4,xmm4,4 + + vpternlogq xmm4,xmm0,xmm16,0x96 + +$L$_partial_done_984: + vmovq xmm5,QWORD[56+rcx] + vpinsrq xmm5,xmm5,QWORD[48+rcx],1 + vpsllq xmm5,xmm5,3 + + vpxor xmm4,xmm4,xmm5 + + vpclmulqdq xmm0,xmm4,xmm2,0x11 + vpclmulqdq xmm16,xmm4,xmm2,0x00 + vpclmulqdq xmm17,xmm4,xmm2,0x01 + vpclmulqdq xmm4,xmm4,xmm2,0x10 + vpxorq xmm4,xmm4,xmm17 + + vpsrldq xmm17,xmm4,8 + vpslldq xmm4,xmm4,8 + vpxorq xmm0,xmm0,xmm17 + vpxorq xmm4,xmm4,xmm16 + + + + vmovdqu64 xmm17,XMMWORD[POLY2] + + vpclmulqdq xmm16,xmm17,xmm4,0x01 + vpslldq xmm16,xmm16,8 + vpxorq xmm4,xmm4,xmm16 + + + + vpclmulqdq xmm16,xmm17,xmm4,0x00 + vpsrldq xmm16,xmm16,4 + vpclmulqdq xmm4,xmm17,xmm4,0x10 + vpslldq xmm4,xmm4,4 + + vpternlogq xmm4,xmm0,xmm16,0x96 + + vpshufb xmm4,xmm4,XMMWORD[SHUF_MASK] + vpxor xmm3,xmm3,xmm4 + +$L$_return_T_984: + vmovdqu XMMWORD[64+rcx],xmm3 +$L$abort_finalize: + DB 0F3h,0C3h ;repret + + +global ossl_gcm_gmult_avx512 + + +ALIGN 32 +ossl_gcm_gmult_avx512: + +DB 243,15,30,250 + vmovdqu64 xmm1,XMMWORD[rcx] + vmovdqu64 xmm2,XMMWORD[336+rdx] + + 
vpclmulqdq xmm3,xmm1,xmm2,0x11 + vpclmulqdq xmm4,xmm1,xmm2,0x00 + vpclmulqdq xmm5,xmm1,xmm2,0x01 + vpclmulqdq xmm1,xmm1,xmm2,0x10 + vpxorq xmm1,xmm1,xmm5 + + vpsrldq xmm5,xmm1,8 + vpslldq xmm1,xmm1,8 + vpxorq xmm3,xmm3,xmm5 + vpxorq xmm1,xmm1,xmm4 + + + + vmovdqu64 xmm5,XMMWORD[POLY2] + + vpclmulqdq xmm4,xmm5,xmm1,0x01 + vpslldq xmm4,xmm4,8 + vpxorq xmm1,xmm1,xmm4 + + + + vpclmulqdq xmm4,xmm5,xmm1,0x00 + vpsrldq xmm4,xmm4,4 + vpclmulqdq xmm1,xmm5,xmm1,0x10 + vpslldq xmm1,xmm1,4 + + vpternlogq xmm1,xmm3,xmm4,0x96 + + vmovdqu64 XMMWORD[rcx],xmm1 + vzeroupper +$L$abort_gmult: + DB 0F3h,0C3h ;repret + + +section .pdata rdata align=4 +ALIGN 4 + DD $L$setiv_seh_begin wrt ..imagebase + DD $L$setiv_seh_end wrt ..imagebase + DD $L$setiv_seh_info wrt ..imagebase + + DD $L$ghash_seh_begin wrt ..imagebase + DD $L$ghash_seh_end wrt ..imagebase + DD $L$ghash_seh_info wrt ..imagebase + + DD $L$encrypt_seh_begin wrt ..imagebase + DD $L$encrypt_seh_end wrt ..imagebase + DD $L$encrypt_seh_info wrt ..imagebase + + DD $L$decrypt_seh_begin wrt ..imagebase + DD $L$decrypt_seh_end wrt ..imagebase + DD $L$decrypt_seh_info wrt ..imagebase + +section .xdata rdata align=8 +ALIGN 8 +$L$setiv_seh_info: +DB 1 +DB $L$setiv_seh_prolog_end-$L$setiv_seh_begin +DB 31 + +DB 165 +DB $L$setiv_seh_save_xmm15-$L$setiv_seh_begin +DB 248 + DW 9 +DB $L$setiv_seh_save_xmm14-$L$setiv_seh_begin +DB 232 + DW 8 +DB $L$setiv_seh_save_xmm13-$L$setiv_seh_begin +DB 216 + DW 7 +DB $L$setiv_seh_save_xmm12-$L$setiv_seh_begin +DB 200 + DW 6 +DB $L$setiv_seh_save_xmm11-$L$setiv_seh_begin +DB 184 + DW 5 +DB $L$setiv_seh_save_xmm10-$L$setiv_seh_begin +DB 168 + DW 4 +DB $L$setiv_seh_save_xmm9-$L$setiv_seh_begin +DB 152 + DW 3 +DB $L$setiv_seh_save_xmm8-$L$setiv_seh_begin +DB 136 + DW 2 +DB $L$setiv_seh_save_xmm7-$L$setiv_seh_begin +DB 120 + DW 1 +DB $L$setiv_seh_save_xmm6-$L$setiv_seh_begin +DB 104 + DW 0 + +DB $L$setiv_seh_setfp-$L$setiv_seh_begin +DB 3 + + +DB $L$setiv_seh_allocstack_xmm-$L$setiv_seh_begin +DB 1 + DW 21 +DB $L$setiv_seh_push_rsi-$L$setiv_seh_begin +DB 96 +DB $L$setiv_seh_push_rdi-$L$setiv_seh_begin +DB 112 +DB $L$setiv_seh_push_r15-$L$setiv_seh_begin +DB 240 +DB $L$setiv_seh_push_r14-$L$setiv_seh_begin +DB 224 +DB $L$setiv_seh_push_r13-$L$setiv_seh_begin +DB 208 +DB $L$setiv_seh_push_r12-$L$setiv_seh_begin +DB 192 +DB $L$setiv_seh_push_rbp-$L$setiv_seh_begin +DB 80 +DB $L$setiv_seh_push_rbx-$L$setiv_seh_begin +DB 48 +ALIGN 8 +$L$ghash_seh_info: +DB 1 +DB $L$ghash_seh_prolog_end-$L$ghash_seh_begin +DB 31 + +DB 165 +DB $L$ghash_seh_save_xmm15-$L$ghash_seh_begin +DB 248 + DW 9 +DB $L$ghash_seh_save_xmm14-$L$ghash_seh_begin +DB 232 + DW 8 +DB $L$ghash_seh_save_xmm13-$L$ghash_seh_begin +DB 216 + DW 7 +DB $L$ghash_seh_save_xmm12-$L$ghash_seh_begin +DB 200 + DW 6 +DB $L$ghash_seh_save_xmm11-$L$ghash_seh_begin +DB 184 + DW 5 +DB $L$ghash_seh_save_xmm10-$L$ghash_seh_begin +DB 168 + DW 4 +DB $L$ghash_seh_save_xmm9-$L$ghash_seh_begin +DB 152 + DW 3 +DB $L$ghash_seh_save_xmm8-$L$ghash_seh_begin +DB 136 + DW 2 +DB $L$ghash_seh_save_xmm7-$L$ghash_seh_begin +DB 120 + DW 1 +DB $L$ghash_seh_save_xmm6-$L$ghash_seh_begin +DB 104 + DW 0 + +DB $L$ghash_seh_setfp-$L$ghash_seh_begin +DB 3 + + +DB $L$ghash_seh_allocstack_xmm-$L$ghash_seh_begin +DB 1 + DW 21 +DB $L$ghash_seh_push_rsi-$L$ghash_seh_begin +DB 96 +DB $L$ghash_seh_push_rdi-$L$ghash_seh_begin +DB 112 +DB $L$ghash_seh_push_r15-$L$ghash_seh_begin +DB 240 +DB $L$ghash_seh_push_r14-$L$ghash_seh_begin +DB 224 +DB $L$ghash_seh_push_r13-$L$ghash_seh_begin +DB 208 +DB 
$L$ghash_seh_push_r12-$L$ghash_seh_begin +DB 192 +DB $L$ghash_seh_push_rbp-$L$ghash_seh_begin +DB 80 +DB $L$ghash_seh_push_rbx-$L$ghash_seh_begin +DB 48 +ALIGN 8 +$L$encrypt_seh_info: +DB 1 +DB $L$encrypt_seh_prolog_end-$L$encrypt_seh_begin +DB 31 + +DB 165 +DB $L$encrypt_seh_save_xmm15-$L$encrypt_seh_begin +DB 248 + DW 9 +DB $L$encrypt_seh_save_xmm14-$L$encrypt_seh_begin +DB 232 + DW 8 +DB $L$encrypt_seh_save_xmm13-$L$encrypt_seh_begin +DB 216 + DW 7 +DB $L$encrypt_seh_save_xmm12-$L$encrypt_seh_begin +DB 200 + DW 6 +DB $L$encrypt_seh_save_xmm11-$L$encrypt_seh_begin +DB 184 + DW 5 +DB $L$encrypt_seh_save_xmm10-$L$encrypt_seh_begin +DB 168 + DW 4 +DB $L$encrypt_seh_save_xmm9-$L$encrypt_seh_begin +DB 152 + DW 3 +DB $L$encrypt_seh_save_xmm8-$L$encrypt_seh_begin +DB 136 + DW 2 +DB $L$encrypt_seh_save_xmm7-$L$encrypt_seh_begin +DB 120 + DW 1 +DB $L$encrypt_seh_save_xmm6-$L$encrypt_seh_begin +DB 104 + DW 0 + +DB $L$encrypt_seh_setfp-$L$encrypt_seh_begin +DB 3 + + +DB $L$encrypt_seh_allocstack_xmm-$L$encrypt_seh_begin +DB 1 + DW 21 +DB $L$encrypt_seh_push_rsi-$L$encrypt_seh_begin +DB 96 +DB $L$encrypt_seh_push_rdi-$L$encrypt_seh_begin +DB 112 +DB $L$encrypt_seh_push_r15-$L$encrypt_seh_begin +DB 240 +DB $L$encrypt_seh_push_r14-$L$encrypt_seh_begin +DB 224 +DB $L$encrypt_seh_push_r13-$L$encrypt_seh_begin +DB 208 +DB $L$encrypt_seh_push_r12-$L$encrypt_seh_begin +DB 192 +DB $L$encrypt_seh_push_rbp-$L$encrypt_seh_begin +DB 80 +DB $L$encrypt_seh_push_rbx-$L$encrypt_seh_begin +DB 48 +ALIGN 8 +$L$decrypt_seh_info: +DB 1 +DB $L$decrypt_seh_prolog_end-$L$decrypt_seh_begin +DB 31 + +DB 165 +DB $L$decrypt_seh_save_xmm15-$L$decrypt_seh_begin +DB 248 + DW 9 +DB $L$decrypt_seh_save_xmm14-$L$decrypt_seh_begin +DB 232 + DW 8 +DB $L$decrypt_seh_save_xmm13-$L$decrypt_seh_begin +DB 216 + DW 7 +DB $L$decrypt_seh_save_xmm12-$L$decrypt_seh_begin +DB 200 + DW 6 +DB $L$decrypt_seh_save_xmm11-$L$decrypt_seh_begin +DB 184 + DW 5 +DB $L$decrypt_seh_save_xmm10-$L$decrypt_seh_begin +DB 168 + DW 4 +DB $L$decrypt_seh_save_xmm9-$L$decrypt_seh_begin +DB 152 + DW 3 +DB $L$decrypt_seh_save_xmm8-$L$decrypt_seh_begin +DB 136 + DW 2 +DB $L$decrypt_seh_save_xmm7-$L$decrypt_seh_begin +DB 120 + DW 1 +DB $L$decrypt_seh_save_xmm6-$L$decrypt_seh_begin +DB 104 + DW 0 + +DB $L$decrypt_seh_setfp-$L$decrypt_seh_begin +DB 3 + + +DB $L$decrypt_seh_allocstack_xmm-$L$decrypt_seh_begin +DB 1 + DW 21 +DB $L$decrypt_seh_push_rsi-$L$decrypt_seh_begin +DB 96 +DB $L$decrypt_seh_push_rdi-$L$decrypt_seh_begin +DB 112 +DB $L$decrypt_seh_push_r15-$L$decrypt_seh_begin +DB 240 +DB $L$decrypt_seh_push_r14-$L$decrypt_seh_begin +DB 224 +DB $L$decrypt_seh_push_r13-$L$decrypt_seh_begin +DB 208 +DB $L$decrypt_seh_push_r12-$L$decrypt_seh_begin +DB 192 +DB $L$decrypt_seh_push_rbp-$L$decrypt_seh_begin +DB 80 +DB $L$decrypt_seh_push_rbx-$L$decrypt_seh_begin +DB 48 +section .rdata rdata align=16 +ALIGN 16 +POLY: DQ 0x0000000000000001,0xC200000000000000 + +ALIGN 64 +POLY2: + DQ 0x00000001C2000000,0xC200000000000000 + DQ 0x00000001C2000000,0xC200000000000000 + DQ 0x00000001C2000000,0xC200000000000000 + DQ 0x00000001C2000000,0xC200000000000000 + +ALIGN 16 +TWOONE: DQ 0x0000000000000001,0x0000000100000000 + + + +ALIGN 64 +SHUF_MASK: + DQ 0x08090A0B0C0D0E0F,0x0001020304050607 + DQ 0x08090A0B0C0D0E0F,0x0001020304050607 + DQ 0x08090A0B0C0D0E0F,0x0001020304050607 + DQ 0x08090A0B0C0D0E0F,0x0001020304050607 + +ALIGN 16 +SHIFT_MASK: + DQ 0x0706050403020100,0x0f0e0d0c0b0a0908 + +ALL_F: + DQ 0xffffffffffffffff,0xffffffffffffffff + +ZERO: + DQ 0x0000000000000000,0x0000000000000000 + 
+ALIGN 16 +ONE: + DQ 0x0000000000000001,0x0000000000000000 + +ALIGN 16 +ONEf: + DQ 0x0000000000000000,0x0100000000000000 + +ALIGN 64 +ddq_add_1234: + DQ 0x0000000000000001,0x0000000000000000 + DQ 0x0000000000000002,0x0000000000000000 + DQ 0x0000000000000003,0x0000000000000000 + DQ 0x0000000000000004,0x0000000000000000 + +ALIGN 64 +ddq_add_5678: + DQ 0x0000000000000005,0x0000000000000000 + DQ 0x0000000000000006,0x0000000000000000 + DQ 0x0000000000000007,0x0000000000000000 + DQ 0x0000000000000008,0x0000000000000000 + +ALIGN 64 +ddq_add_4444: + DQ 0x0000000000000004,0x0000000000000000 + DQ 0x0000000000000004,0x0000000000000000 + DQ 0x0000000000000004,0x0000000000000000 + DQ 0x0000000000000004,0x0000000000000000 + +ALIGN 64 +ddq_add_8888: + DQ 0x0000000000000008,0x0000000000000000 + DQ 0x0000000000000008,0x0000000000000000 + DQ 0x0000000000000008,0x0000000000000000 + DQ 0x0000000000000008,0x0000000000000000 + +ALIGN 64 +ddq_addbe_1234: + DQ 0x0000000000000000,0x0100000000000000 + DQ 0x0000000000000000,0x0200000000000000 + DQ 0x0000000000000000,0x0300000000000000 + DQ 0x0000000000000000,0x0400000000000000 + +ALIGN 64 +ddq_addbe_4444: + DQ 0x0000000000000000,0x0400000000000000 + DQ 0x0000000000000000,0x0400000000000000 + DQ 0x0000000000000000,0x0400000000000000 + DQ 0x0000000000000000,0x0400000000000000 + +ALIGN 64 +byte_len_to_mask_table: + DW 0x0000,0x0001,0x0003,0x0007 + DW 0x000f,0x001f,0x003f,0x007f + DW 0x00ff,0x01ff,0x03ff,0x07ff + DW 0x0fff,0x1fff,0x3fff,0x7fff + DW 0xffff + +ALIGN 64 +byte64_len_to_mask_table: + DQ 0x0000000000000000,0x0000000000000001 + DQ 0x0000000000000003,0x0000000000000007 + DQ 0x000000000000000f,0x000000000000001f + DQ 0x000000000000003f,0x000000000000007f + DQ 0x00000000000000ff,0x00000000000001ff + DQ 0x00000000000003ff,0x00000000000007ff + DQ 0x0000000000000fff,0x0000000000001fff + DQ 0x0000000000003fff,0x0000000000007fff + DQ 0x000000000000ffff,0x000000000001ffff + DQ 0x000000000003ffff,0x000000000007ffff + DQ 0x00000000000fffff,0x00000000001fffff + DQ 0x00000000003fffff,0x00000000007fffff + DQ 0x0000000000ffffff,0x0000000001ffffff + DQ 0x0000000003ffffff,0x0000000007ffffff + DQ 0x000000000fffffff,0x000000001fffffff + DQ 0x000000003fffffff,0x000000007fffffff + DQ 0x00000000ffffffff,0x00000001ffffffff + DQ 0x00000003ffffffff,0x00000007ffffffff + DQ 0x0000000fffffffff,0x0000001fffffffff + DQ 0x0000003fffffffff,0x0000007fffffffff + DQ 0x000000ffffffffff,0x000001ffffffffff + DQ 0x000003ffffffffff,0x000007ffffffffff + DQ 0x00000fffffffffff,0x00001fffffffffff + DQ 0x00003fffffffffff,0x00007fffffffffff + DQ 0x0000ffffffffffff,0x0001ffffffffffff + DQ 0x0003ffffffffffff,0x0007ffffffffffff + DQ 0x000fffffffffffff,0x001fffffffffffff + DQ 0x003fffffffffffff,0x007fffffffffffff + DQ 0x00ffffffffffffff,0x01ffffffffffffff + DQ 0x03ffffffffffffff,0x07ffffffffffffff + DQ 0x0fffffffffffffff,0x1fffffffffffffff + DQ 0x3fffffffffffffff,0x7fffffffffffffff + DQ 0xffffffffffffffff diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/modes/aesni-gcm-x86_64.nasm b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/modes/aesni-gcm-x86_64.nasm index cbc06ca5fae0..0db04b0634cf 100644 --- a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/modes/aesni-gcm-x86_64.nasm +++ b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/modes/aesni-gcm-x86_64.nasm @@ -851,6 +851,7 @@ $L$gcm_enc_abort: DB 0F3h,0C3h ;repret $L$SEH_end_aesni_gcm_encrypt: +section .rdata rdata align=64 ALIGN 64 $L$bswap_mask: DB 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 @@ -866,6 +867,7 @@ DB 
65,69,83,45,78,73,32,71,67,77,32,109,111,100,117,108 DB 101,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82 DB 89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112 DB 114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +section .text ALIGN 64 EXTERN __imp_RtlVirtualUnwind diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/modes/ghash-x86_64.nasm b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/modes/ghash-x86_64.nasm index e70f90841bcb..91cd042a33f2 100644 --- a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/modes/ghash-x86_64.nasm +++ b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/modes/ghash-x86_64.nasm @@ -723,6 +723,7 @@ global gcm_init_clmul ALIGN 16 gcm_init_clmul: +DB 243,15,30,250 $L$_init_clmul: $L$SEH_begin_gcm_init_clmul: @@ -1354,6 +1355,7 @@ global gcm_init_avx ALIGN 32 gcm_init_avx: +DB 243,15,30,250 $L$SEH_begin_gcm_init_avx: DB 0x48,0x83,0xec,0x18 @@ -1879,6 +1881,7 @@ $L$SEH_end_gcm_ghash_avx: DB 0F3h,0C3h ;repret +section .rdata rdata align=64 ALIGN 64 $L$bswap_mask: DB 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 @@ -1935,6 +1938,7 @@ DB 44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32 DB 60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111 DB 114,103,62,0 ALIGN 64 +section .text EXTERN __imp_RtlVirtualUnwind ALIGN 16 diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/sha/keccak1600-x86_64.nasm b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/sha/keccak1600-x86_64.nasm index af4b87d68b21..0264f5373491 100644 --- a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/sha/keccak1600-x86_64.nasm +++ b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/sha/keccak1600-x86_64.nasm @@ -436,6 +436,7 @@ $L$SEH_begin_SHA3_squeeze: mov rsi,rdx mov rdx,r8 mov rcx,r9 + mov r8,QWORD[40+rsp] @@ -447,10 +448,12 @@ $L$SEH_begin_SHA3_squeeze: shr rcx,3 - mov r8,rdi + mov r9,rdi mov r12,rsi mov r13,rdx mov r14,rcx + bt r8d,0 + jc NEAR $L$next_block jmp NEAR $L$oop_squeeze ALIGN 32 @@ -458,8 +461,8 @@ $L$oop_squeeze: cmp r13,8 jb NEAR $L$tail_squeeze - mov rax,QWORD[r8] - lea r8,[8+r8] + mov rax,QWORD[r9] + lea r9,[8+r9] mov QWORD[r12],rax lea r12,[8+r12] sub r13,8 @@ -467,14 +470,14 @@ $L$oop_squeeze: sub rcx,1 jnz NEAR $L$oop_squeeze - +$L$next_block: call KeccakF1600 - mov r8,rdi + mov r9,rdi mov rcx,r14 jmp NEAR $L$oop_squeeze $L$tail_squeeze: - mov rsi,r8 + mov rsi,r9 mov rdi,r12 mov rcx,r13 DB 0xf3,0xa4 @@ -491,6 +494,7 @@ $L$done_squeeze: DB 0F3h,0C3h ;repret $L$SEH_end_SHA3_squeeze: +section .rdata rdata align=256 ALIGN 256 DQ 0,0,0,0,0,0,0,0 diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/sha/sha1-mb-x86_64.nasm b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/sha/sha1-mb-x86_64.nasm index 9018065f8dde..ac1470cbb798 100644 --- a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/sha/sha1-mb-x86_64.nasm +++ b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/sha/sha1-mb-x86_64.nasm @@ -7419,7 +7419,7 @@ $L$epilogue_avx2: DB 0F3h,0C3h ;repret $L$SEH_end_sha1_multi_block_avx2: - +section .rdata rdata align=256 ALIGN 256 DD 0x5a827999,0x5a827999,0x5a827999,0x5a827999 DD 0x5a827999,0x5a827999,0x5a827999,0x5a827999 @@ -7438,6 +7438,7 @@ DB 32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120 DB 56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77 DB 83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110 DB 115,115,108,46,111,114,103,62,0 +section .text EXTERN __imp_RtlVirtualUnwind ALIGN 16 diff --git 
a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/sha/sha1-x86_64.nasm b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/sha/sha1-x86_64.nasm index 9d1f10e1ee69..912918e04449 100644 --- a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/sha/sha1-x86_64.nasm +++ b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/sha/sha1-x86_64.nasm @@ -5545,6 +5545,7 @@ $L$epilogue_avx2: DB 0F3h,0C3h ;repret $L$SEH_end_sha1_block_data_order_avx2: +section .rdata rdata align=64 ALIGN 64 K_XX_XX: DD 0x5a827999,0x5a827999,0x5a827999,0x5a827999 @@ -5558,6 +5559,7 @@ K_XX_XX: DD 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f DD 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f DB 0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0 +section .text DB 83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115 DB 102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44 DB 32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60 diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/sha/sha256-mb-x86_64.nasm b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/sha/sha256-mb-x86_64.nasm index 58c00d6b92c0..cfee9855f63e 100644 --- a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/sha/sha256-mb-x86_64.nasm +++ b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/sha/sha256-mb-x86_64.nasm @@ -7964,6 +7964,7 @@ $L$epilogue_avx2: DB 0F3h,0C3h ;repret $L$SEH_end_sha256_multi_block_avx2: +section .rdata rdata align=256 ALIGN 256 K256: DD 1116352408,1116352408,1116352408,1116352408 @@ -8119,6 +8120,7 @@ DB 99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114 DB 32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71 DB 65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112 DB 101,110,115,115,108,46,111,114,103,62,0 +section .text EXTERN __imp_RtlVirtualUnwind ALIGN 16 diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/sha/sha256-x86_64.nasm b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/sha/sha256-x86_64.nasm index 8238c4e4636a..8cb6a3178bcf 100644 --- a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/sha/sha256-x86_64.nasm +++ b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/sha/sha256-x86_64.nasm @@ -1744,6 +1744,7 @@ $L$epilogue: DB 0F3h,0C3h ;repret $L$SEH_end_sha256_block_data_order: +section .rdata rdata align=64 ALIGN 64 K256: @@ -1791,6 +1792,7 @@ DB 110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54 DB 52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121 DB 32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46 DB 111,114,103,62,0 +section .text ALIGN 64 sha256_block_data_order_shaext: diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/sha/sha512-x86_64.nasm b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/sha/sha512-x86_64.nasm index 5ddba53d1c51..eab959eaa721 100644 --- a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/sha/sha512-x86_64.nasm +++ b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/sha/sha512-x86_64.nasm @@ -1742,6 +1742,7 @@ $L$epilogue: DB 0F3h,0C3h ;repret $L$SEH_end_sha512_block_data_order: +section .rdata rdata align=64 ALIGN 64 K512: @@ -1833,6 +1834,7 @@ DB 110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54 DB 52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121 DB 32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46 DB 111,114,103,62,0 +section .text ALIGN 64 sha512_block_data_order_xop: diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/crypto/params_idx.c b/CryptoPkg/Library/OpensslLib/OpensslGen/crypto/params_idx.c new file mode 
100644 index 000000000000..fd9b9ae658f5 --- /dev/null +++ b/CryptoPkg/Library/OpensslLib/OpensslGen/crypto/params_idx.c @@ -0,0 +1,3229 @@ +/* + * WARNING: do not edit! + * Generated by Makefile from crypto/params_idx.c.in + * + * Copyright 2023 The OpenSSL Project Authors. All Rights Reserved. + * + * Licensed under the Apache License 2.0 (the "License"). You may not use + * this file except in compliance with the License. You can obtain a copy + * in the file LICENSE in the source distribution or at + * https://www.openssl.org/source/license.html + */ + + +#include "internal/e_os.h" +#include "internal/param_names.h" +#include + +/* Machine generated TRIE -- generated by util/perl/OpenSSL/paramnames.pm */ +int ossl_param_find_pidx(const char *s) +{ + switch(s[0]) { + default: + break; + case 'a': + switch(s[1]) { + default: + break; + case 'c': + if (strcmp("vp-info", s + 2) == 0) + return PIDX_KDF_PARAM_X942_ACVPINFO; + break; + case 'd': + switch(s[2]) { + default: + break; + case '\0': + return PIDX_KDF_PARAM_ARGON2_AD; + } + break; + case 'e': + if (strcmp("ad", s + 2) == 0) + return PIDX_CIPHER_PARAM_AEAD; + break; + case 'l': + switch(s[2]) { + default: + break; + case 'g': + switch(s[3]) { + default: + break; + case '_': + if (strcmp("id_param", s + 4) == 0) + return PIDX_CIPHER_PARAM_ALGORITHM_ID_PARAMS_OLD; + break; + case 'i': + if (strcmp("d-absent", s + 4) == 0) + return PIDX_DIGEST_PARAM_ALGID_ABSENT; + break; + case 'o': + switch(s[4]) { + default: + break; + case 'r': + switch(s[5]) { + default: + break; + case 'i': + switch(s[6]) { + default: + break; + case 't': + switch(s[7]) { + default: + break; + case 'h': + switch(s[8]) { + default: + break; + case 'm': + switch(s[9]) { + default: + break; + case '-': + switch(s[10]) { + default: + break; + case 'i': + switch(s[11]) { + default: + break; + case 'd': + switch(s[12]) { + default: + break; + case '-': + if (strcmp("params", s + 13) == 0) + return PIDX_ALG_PARAM_ALGORITHM_ID_PARAMS; + break; + case '\0': + return PIDX_ALG_PARAM_ALGORITHM_ID; + } + } + } + } + } + } + } + } + } + } + break; + case 'i': + if (strcmp("as", s + 3) == 0) + return PIDX_STORE_PARAM_ALIAS; + } + break; + case '\0': + return PIDX_PKEY_PARAM_EC_A; + } + break; + case 'b': + switch(s[1]) { + default: + break; + case 'a': + if (strcmp("sis-type", s + 2) == 0) + return PIDX_PKEY_PARAM_EC_CHAR2_TYPE; + break; + case 'i': + if (strcmp("ts", s + 2) == 0) + return PIDX_PKEY_PARAM_BITS; + break; + case 'l': + switch(s[2]) { + default: + break; + case 'o': + switch(s[3]) { + default: + break; + case 'c': + switch(s[4]) { + default: + break; + case 'k': + switch(s[5]) { + default: + break; + case '-': + if (strcmp("size", s + 6) == 0) + return PIDX_MAC_PARAM_BLOCK_SIZE; + break; + case '_': + if (strcmp("padding", s + 6) == 0) + return PIDX_LIBSSL_RECORD_LAYER_PARAM_BLOCK_PADDING; + break; + case 's': + if (strcmp("ize", s + 6) == 0) + return PIDX_DIGEST_PARAM_BLOCK_SIZE; + } + } + } + } + break; + case 'u': + if (strcmp("ildinfo", s + 2) == 0) + return PIDX_PROV_PARAM_BUILDINFO; + break; + case '\0': + return PIDX_PKEY_PARAM_EC_B; + } + break; + case 'c': + switch(s[1]) { + default: + break; + case '-': + if (strcmp("rounds", s + 2) == 0) + return PIDX_MAC_PARAM_C_ROUNDS; + break; + case 'e': + if (strcmp("kalg", s + 2) == 0) + return PIDX_KDF_PARAM_CEK_ALG; + break; + case 'i': + if (strcmp("pher", s + 2) == 0) + return PIDX_ALG_PARAM_CIPHER; + break; + case 'o': + switch(s[2]) { + default: + break; + case 'f': + if (strcmp("actor", s + 3) == 0) + return 
PIDX_PKEY_PARAM_EC_COFACTOR; + break; + case 'n': + switch(s[3]) { + default: + break; + case 's': + if (strcmp("tant", s + 4) == 0) + return PIDX_KDF_PARAM_CONSTANT; + break; + case 't': + if (strcmp("ext-string", s + 4) == 0) + return PIDX_SIGNATURE_PARAM_CONTEXT_STRING; + } + } + break; + case 't': + switch(s[2]) { + default: + break; + case 's': + switch(s[3]) { + default: + break; + case '_': + if (strcmp("mode", s + 4) == 0) + return PIDX_CIPHER_PARAM_CTS_MODE; + break; + case '\0': + return PIDX_CIPHER_PARAM_CTS; + } + } + break; + case 'u': + switch(s[2]) { + default: + break; + case 's': + switch(s[3]) { + default: + break; + case 't': + switch(s[4]) { + default: + break; + case 'o': + switch(s[5]) { + default: + break; + case 'm': + switch(s[6]) { + default: + break; + case '-': + if (strcmp("iv", s + 7) == 0) + return PIDX_CIPHER_PARAM_CUSTOM_IV; + break; + case '\0': + return PIDX_MAC_PARAM_CUSTOM; + } + } + } + } + } + } + break; + case 'd': + switch(s[1]) { + default: + break; + case '-': + if (strcmp("rounds", s + 2) == 0) + return PIDX_MAC_PARAM_D_ROUNDS; + break; + case 'a': + switch(s[2]) { + default: + break; + case 't': + switch(s[3]) { + default: + break; + case 'a': + switch(s[4]) { + default: + break; + case '-': + switch(s[5]) { + default: + break; + case 's': + if (strcmp("tructure", s + 6) == 0) + return PIDX_OBJECT_PARAM_DATA_STRUCTURE; + break; + case 't': + if (strcmp("ype", s + 6) == 0) + return PIDX_OBJECT_PARAM_DATA_TYPE; + } + break; + case '\0': + return PIDX_OBJECT_PARAM_DATA; + } + } + } + break; + case 'e': + switch(s[2]) { + default: + break; + case 'c': + switch(s[3]) { + default: + break; + case 'o': + if (strcmp("ded-from-explicit", s + 4) == 0) + return PIDX_PKEY_PARAM_EC_DECODED_FROM_EXPLICIT_PARAMS; + break; + case 'r': + if (strcmp("ypt-only", s + 4) == 0) + return PIDX_CIPHER_PARAM_DECRYPT_ONLY; + } + break; + case 'f': + if (strcmp("ault-digest", s + 3) == 0) + return PIDX_PKEY_PARAM_DEFAULT_DIGEST; + break; + case 's': + if (strcmp("c", s + 3) == 0) + return PIDX_OBJECT_PARAM_DESC; + } + break; + case 'h': + if (strcmp("kem-ikm", s + 2) == 0) + return PIDX_PKEY_PARAM_DHKEM_IKM; + break; + case 'i': + switch(s[2]) { + default: + break; + case 'g': + switch(s[3]) { + default: + break; + case 'e': + switch(s[4]) { + default: + break; + case 's': + switch(s[5]) { + default: + break; + case 't': + switch(s[6]) { + default: + break; + case '-': + switch(s[7]) { + default: + break; + case 'c': + if (strcmp("heck", s + 8) == 0) + return PIDX_PKEY_PARAM_FIPS_DIGEST_CHECK; + break; + case 'n': + if (strcmp("oinit", s + 8) == 0) + return PIDX_MAC_PARAM_DIGEST_NOINIT; + break; + case 'o': + if (strcmp("neshot", s + 8) == 0) + return PIDX_MAC_PARAM_DIGEST_ONESHOT; + break; + case 'p': + if (strcmp("rops", s + 8) == 0) + return PIDX_ASYM_CIPHER_PARAM_OAEP_DIGEST_PROPS; + break; + case 's': + if (strcmp("ize", s + 8) == 0) + return PIDX_PKEY_PARAM_DIGEST_SIZE; + } + break; + case '\0': + return PIDX_STORE_PARAM_DIGEST; + } + } + } + } + break; + case 's': + if (strcmp("tid", s + 3) == 0) + return PIDX_PKEY_PARAM_DIST_ID; + } + break; + case 'r': + if (strcmp("bg-no-trunc-md", s + 2) == 0) + return PIDX_PROV_PARAM_DRBG_TRUNC_DIGEST; + break; + case 's': + if (strcmp("a-sign-disabled", s + 2) == 0) + return PIDX_PROV_PARAM_DSA_SIGN_DISABLED; + break; + case '\0': + return PIDX_PKEY_PARAM_RSA_D; + } + break; + case 'e': + switch(s[1]) { + default: + break; + case 'a': + if (strcmp("rly_clean", s + 2) == 0) + return PIDX_KDF_PARAM_EARLY_CLEAN; + break; + case 
'c': + switch(s[2]) { + default: + break; + case 'd': + switch(s[3]) { + default: + break; + case 'h': + switch(s[4]) { + default: + break; + case '-': + switch(s[5]) { + default: + break; + case 'c': + switch(s[6]) { + default: + break; + case 'o': + switch(s[7]) { + default: + break; + case 'f': + switch(s[8]) { + default: + break; + case 'a': + switch(s[9]) { + default: + break; + case 'c': + switch(s[10]) { + default: + break; + case 't': + switch(s[11]) { + default: + break; + case 'o': + switch(s[12]) { + default: + break; + case 'r': + switch(s[13]) { + default: + break; + case '-': + switch(s[14]) { + default: + break; + case 'c': + if (strcmp("heck", s + 15) == 0) + return PIDX_PROV_PARAM_ECDH_COFACTOR_CHECK; + break; + case 'm': + if (strcmp("ode", s + 15) == 0) + return PIDX_EXCHANGE_PARAM_EC_ECDH_COFACTOR_MODE; + } + } + } + } + } + } + } + } + } + } + } + } + } + break; + case 'm': + if (strcmp("s_check", s + 2) == 0) + return PIDX_KDF_PARAM_FIPS_EMS_CHECK; + break; + case 'n': + switch(s[2]) { + default: + break; + case 'c': + switch(s[3]) { + default: + break; + case 'o': + switch(s[4]) { + default: + break; + case 'd': + switch(s[5]) { + default: + break; + case 'e': + if (strcmp("d-pub-key", s + 6) == 0) + return PIDX_PKEY_PARAM_ENCODED_PUBLIC_KEY; + break; + case 'i': + if (strcmp("ng", s + 6) == 0) + return PIDX_PKEY_PARAM_EC_ENCODING; + } + } + break; + case 'r': + switch(s[4]) { + default: + break; + case 'y': + switch(s[5]) { + default: + break; + case 'p': + switch(s[6]) { + default: + break; + case 't': + switch(s[7]) { + default: + break; + case '-': + switch(s[8]) { + default: + break; + case 'c': + if (strcmp("heck", s + 9) == 0) + return PIDX_CIPHER_PARAM_FIPS_ENCRYPT_CHECK; + break; + case 'l': + if (strcmp("evel", s + 9) == 0) + return PIDX_ENCODER_PARAM_ENCRYPT_LEVEL; + } + } + } + } + } + } + break; + case 'g': + if (strcmp("ine", s + 3) == 0) + return PIDX_ALG_PARAM_ENGINE; + break; + case 't': + switch(s[3]) { + default: + break; + case 'r': + switch(s[4]) { + default: + break; + case 'o': + switch(s[5]) { + default: + break; + case 'p': + switch(s[6]) { + default: + break; + case 'y': + switch(s[7]) { + default: + break; + case '_': + if (strcmp("required", s + 8) == 0) + return PIDX_DRBG_PARAM_ENTROPY_REQUIRED; + break; + case '\0': + return PIDX_KDF_PARAM_HMACDRBG_ENTROPY; + } + } + } + } + } + } + break; + case '\0': + return PIDX_PKEY_PARAM_RSA_E; + break; + case 'x': + if (strcmp("pect", s + 2) == 0) + return PIDX_STORE_PARAM_EXPECT; + } + break; + case 'f': + switch(s[1]) { + default: + break; + case 'i': + switch(s[2]) { + default: + break; + case 'e': + if (strcmp("ld-type", s + 3) == 0) + return PIDX_PKEY_PARAM_EC_FIELD_TYPE; + break; + case 'n': + if (strcmp("gerprint", s + 3) == 0) + return PIDX_STORE_PARAM_FINGERPRINT; + break; + case 'p': + if (strcmp("s-indicator", s + 3) == 0) + return PIDX_ALG_PARAM_FIPS_APPROVED_INDICATOR; + } + } + break; + case 'g': + switch(s[1]) { + default: + break; + case 'e': + switch(s[2]) { + default: + break; + case 'n': + switch(s[3]) { + default: + break; + case 'e': + switch(s[4]) { + default: + break; + case 'r': + switch(s[5]) { + default: + break; + case 'a': + switch(s[6]) { + default: + break; + case 't': + switch(s[7]) { + default: + break; + case 'e': + switch(s[8]) { + default: + break; + case '\0': + return PIDX_RAND_PARAM_GENERATE; + } + break; + case 'o': + if (strcmp("r", s + 8) == 0) + return PIDX_PKEY_PARAM_EC_GENERATOR; + } + } + } + } + } + } + break; + case 'i': + if (strcmp("ndex", s + 2) == 
0) + return PIDX_PKEY_PARAM_FFC_GINDEX; + break; + case 'r': + switch(s[2]) { + default: + break; + case 'o': + switch(s[3]) { + default: + break; + case 'u': + switch(s[4]) { + default: + break; + case 'p': + switch(s[5]) { + default: + break; + case '-': + if (strcmp("check", s + 6) == 0) + return PIDX_PKEY_PARAM_EC_GROUP_CHECK_TYPE; + break; + case '\0': + return PIDX_PKEY_PARAM_GROUP_NAME; + } + } + } + } + break; + case '\0': + return PIDX_PKEY_PARAM_FFC_G; + } + break; + case 'h': + switch(s[1]) { + default: + break; + case 'a': + if (strcmp("s-randkey", s + 2) == 0) + return PIDX_CIPHER_PARAM_HAS_RAND_KEY; + break; + case 'i': + if (strcmp("ndex", s + 2) == 0) + return PIDX_PKEY_PARAM_FFC_H; + break; + case 'k': + switch(s[2]) { + default: + break; + case 'd': + switch(s[3]) { + default: + break; + case 'f': + switch(s[4]) { + default: + break; + case '-': + switch(s[5]) { + default: + break; + case 'd': + if (strcmp("igest-check", s + 6) == 0) + return PIDX_PROV_PARAM_HKDF_DIGEST_CHECK; + break; + case 'k': + if (strcmp("ey-check", s + 6) == 0) + return PIDX_PROV_PARAM_HKDF_KEY_CHECK; + } + } + } + } + break; + case 'm': + if (strcmp("ac-key-check", s + 2) == 0) + return PIDX_PROV_PARAM_HMAC_KEY_CHECK; + break; + case 's': + if (strcmp("_padding", s + 2) == 0) + return PIDX_LIBSSL_RECORD_LAYER_PARAM_HS_PADDING; + } + break; + case 'i': + switch(s[1]) { + default: + break; + case 'd': + switch(s[2]) { + default: + break; + case '\0': + return PIDX_KDF_PARAM_PKCS12_ID; + } + break; + case 'k': + if (strcmp("me", s + 2) == 0) + return PIDX_KEM_PARAM_IKME; + break; + case 'm': + if (strcmp("plicit-rejection", s + 2) == 0) + return PIDX_PKEY_PARAM_IMPLICIT_REJECTION; + break; + case 'n': + switch(s[2]) { + default: + break; + case 'c': + if (strcmp("lude-public", s + 3) == 0) + return PIDX_PKEY_PARAM_EC_INCLUDE_PUBLIC; + break; + case 'f': + if (strcmp("o", s + 3) == 0) + return PIDX_PASSPHRASE_PARAM_INFO; + break; + case 'p': + if (strcmp("ut-type", s + 3) == 0) + return PIDX_STORE_PARAM_INPUT_TYPE; + break; + case 's': + if (strcmp("tance", s + 3) == 0) + return PIDX_SIGNATURE_PARAM_INSTANCE; + } + break; + case 't': + switch(s[2]) { + default: + break; + case 'e': + switch(s[3]) { + default: + break; + case 'r': + switch(s[4]) { + default: + break; + case 'a': + if (strcmp("tion", s + 5) == 0) + return PIDX_GEN_PARAM_ITERATION; + break; + case '\0': + return PIDX_KDF_PARAM_ITER; + } + } + } + break; + case 'v': + switch(s[2]) { + default: + break; + case '-': + if (strcmp("generated", s + 3) == 0) + return PIDX_CIPHER_PARAM_AEAD_IV_GENERATED; + break; + case 'l': + if (strcmp("en", s + 3) == 0) + return PIDX_CIPHER_PARAM_IVLEN; + break; + case '\0': + return PIDX_MAC_PARAM_IV; + } + } + break; + case 'j': + switch(s[1]) { + default: + break; + case '\0': + return PIDX_PKEY_PARAM_FFC_COFACTOR; + } + break; + case 'k': + switch(s[1]) { + default: + break; + case '1': + switch(s[2]) { + default: + break; + case '\0': + return PIDX_PKEY_PARAM_EC_CHAR2_PP_K1; + } + break; + case '2': + switch(s[2]) { + default: + break; + case '\0': + return PIDX_PKEY_PARAM_EC_CHAR2_PP_K2; + } + break; + case '3': + switch(s[2]) { + default: + break; + case '\0': + return PIDX_PKEY_PARAM_EC_CHAR2_PP_K3; + } + break; + case 'a': + if (strcmp("t", s + 2) == 0) + return PIDX_SIGNATURE_PARAM_KAT; + break; + case 'b': + if (strcmp("kdf-key-check", s + 2) == 0) + return PIDX_PROV_PARAM_KBKDF_KEY_CHECK; + break; + case 'd': + switch(s[2]) { + default: + break; + case 'f': + switch(s[3]) { + default: + break; + 
case '-': + switch(s[4]) { + default: + break; + case 'd': + switch(s[5]) { + default: + break; + case 'i': + switch(s[6]) { + default: + break; + case 'g': + switch(s[7]) { + default: + break; + case 'e': + switch(s[8]) { + default: + break; + case 's': + switch(s[9]) { + default: + break; + case 't': + switch(s[10]) { + default: + break; + case '-': + if (strcmp("props", s + 11) == 0) + return PIDX_EXCHANGE_PARAM_KDF_DIGEST_PROPS; + break; + case '\0': + return PIDX_EXCHANGE_PARAM_KDF_DIGEST; + } + } + } + } + } + } + break; + case 'o': + if (strcmp("utlen", s + 5) == 0) + return PIDX_EXCHANGE_PARAM_KDF_OUTLEN; + break; + case 't': + if (strcmp("ype", s + 5) == 0) + return PIDX_EXCHANGE_PARAM_KDF_TYPE; + break; + case 'u': + if (strcmp("km", s + 5) == 0) + return PIDX_EXCHANGE_PARAM_KDF_UKM; + } + } + } + break; + case 'e': + switch(s[2]) { + default: + break; + case 'y': + switch(s[3]) { + default: + break; + case '-': + if (strcmp("check", s + 4) == 0) + return PIDX_PKEY_PARAM_FIPS_KEY_CHECK; + break; + case 'b': + if (strcmp("its", s + 4) == 0) + return PIDX_CIPHER_PARAM_RC2_KEYBITS; + break; + case 'l': + if (strcmp("en", s + 4) == 0) + return PIDX_CIPHER_PARAM_KEYLEN; + break; + case '\0': + return PIDX_MAC_PARAM_KEY; + } + } + break; + case 'm': + if (strcmp("ac-key-check", s + 2) == 0) + return PIDX_PROV_PARAM_KMAC_KEY_CHECK; + } + break; + case 'l': + switch(s[1]) { + default: + break; + case 'a': + switch(s[2]) { + default: + break; + case 'b': + if (strcmp("el", s + 3) == 0) + return PIDX_KDF_PARAM_LABEL; + break; + case 'n': + if (strcmp("es", s + 3) == 0) + return PIDX_KDF_PARAM_ARGON2_LANES; + } + } + break; + case 'm': + switch(s[1]) { + default: + break; + case 'a': + switch(s[2]) { + default: + break; + case 'c': + switch(s[3]) { + default: + break; + case 'k': + if (strcmp("ey", s + 4) == 0) + return PIDX_CIPHER_PARAM_AEAD_MAC_KEY; + break; + case 'l': + if (strcmp("en", s + 4) == 0) + return PIDX_KDF_PARAM_MAC_SIZE; + break; + case '\0': + return PIDX_ALG_PARAM_MAC; + } + break; + case 'n': + if (strcmp("datory-digest", s + 3) == 0) + return PIDX_PKEY_PARAM_MANDATORY_DIGEST; + break; + case 'x': + switch(s[3]) { + default: + break; + case '-': + if (strcmp("size", s + 4) == 0) + return PIDX_PKEY_PARAM_MAX_SIZE; + break; + case '_': + switch(s[4]) { + default: + break; + case 'a': + if (strcmp("dinlen", s + 5) == 0) + return PIDX_DRBG_PARAM_MAX_ADINLEN; + break; + case 'e': + switch(s[5]) { + default: + break; + case 'a': + if (strcmp("rly_data", s + 6) == 0) + return PIDX_LIBSSL_RECORD_LAYER_PARAM_MAX_EARLY_DATA; + break; + case 'n': + if (strcmp("tropylen", s + 6) == 0) + return PIDX_DRBG_PARAM_MAX_ENTROPYLEN; + } + break; + case 'f': + if (strcmp("rag_len", s + 5) == 0) + return PIDX_LIBSSL_RECORD_LAYER_PARAM_MAX_FRAG_LEN; + break; + case 'n': + if (strcmp("oncelen", s + 5) == 0) + return PIDX_DRBG_PARAM_MAX_NONCELEN; + break; + case 'p': + if (strcmp("erslen", s + 5) == 0) + return PIDX_DRBG_PARAM_MAX_PERSLEN; + break; + case 'r': + if (strcmp("equest", s + 5) == 0) + return PIDX_RAND_PARAM_MAX_REQUEST; + } + break; + case 'i': + if (strcmp("um_length", s + 4) == 0) + return PIDX_DRBG_PARAM_MAX_LENGTH; + break; + case 'm': + if (strcmp("em_bytes", s + 4) == 0) + return PIDX_KDF_PARAM_SCRYPT_MAXMEM; + } + } + break; + case 'e': + if (strcmp("mcost", s + 2) == 0) + return PIDX_KDF_PARAM_ARGON2_MEMCOST; + break; + case 'g': + switch(s[2]) { + default: + break; + case 'f': + switch(s[3]) { + default: + break; + case '1': + switch(s[4]) { + default: + break; + case '-': + 
switch(s[5]) { + default: + break; + case 'd': + if (strcmp("igest", s + 6) == 0) + return PIDX_PKEY_PARAM_MGF1_DIGEST; + break; + case 'p': + if (strcmp("roperties", s + 6) == 0) + return PIDX_PKEY_PARAM_MGF1_PROPERTIES; + } + } + break; + case '\0': + return PIDX_PKEY_PARAM_MASKGENFUNC; + } + } + break; + case 'i': + switch(s[2]) { + default: + break; + case 'c': + if (strcmp("alg", s + 3) == 0) + return PIDX_DIGEST_PARAM_MICALG; + break; + case 'n': + switch(s[3]) { + default: + break; + case '_': + switch(s[4]) { + default: + break; + case 'e': + if (strcmp("ntropylen", s + 5) == 0) + return PIDX_DRBG_PARAM_MIN_ENTROPYLEN; + break; + case 'n': + if (strcmp("oncelen", s + 5) == 0) + return PIDX_DRBG_PARAM_MIN_NONCELEN; + } + break; + case 'i': + if (strcmp("um_length", s + 4) == 0) + return PIDX_DRBG_PARAM_MIN_LENGTH; + } + } + break; + case 'o': + switch(s[2]) { + default: + break; + case 'd': + switch(s[3]) { + default: + break; + case 'e': + switch(s[4]) { + default: + break; + case '\0': + return PIDX_LIBSSL_RECORD_LAYER_PARAM_MODE; + } + break; + case 'u': + if (strcmp("le-filename", s + 4) == 0) + return PIDX_PROV_PARAM_CORE_MODULE_FILENAME; + } + } + break; + case '\0': + return PIDX_PKEY_PARAM_EC_CHAR2_M; + } + break; + case 'n': + switch(s[1]) { + default: + break; + case 'a': + if (strcmp("me", s + 2) == 0) + return PIDX_STORE_PARAM_ISSUER; + break; + case 'o': + switch(s[2]) { + default: + break; + case '-': + if (strcmp("short-mac", s + 3) == 0) + return PIDX_PROV_PARAM_NO_SHORT_MAC; + break; + case 'n': + switch(s[3]) { + default: + break; + case 'c': + switch(s[4]) { + default: + break; + case 'e': + switch(s[5]) { + default: + break; + case '-': + if (strcmp("type", s + 6) == 0) + return PIDX_SIGNATURE_PARAM_NONCE_TYPE; + break; + case '\0': + return PIDX_KDF_PARAM_HMACDRBG_NONCE; + } + } + } + } + break; + case 'u': + if (strcmp("m", s + 2) == 0) + return PIDX_CIPHER_PARAM_NUM; + break; + case '\0': + return PIDX_PKEY_PARAM_RSA_N; + } + break; + case 'o': + switch(s[1]) { + default: + break; + case 'a': + if (strcmp("ep-label", s + 2) == 0) + return PIDX_ASYM_CIPHER_PARAM_OAEP_LABEL; + break; + case 'p': + switch(s[2]) { + default: + break; + case 'e': + switch(s[3]) { + default: + break; + case 'n': + if (strcmp("ssl-version", s + 4) == 0) + return PIDX_PROV_PARAM_CORE_VERSION; + break; + case 'r': + if (strcmp("ation", s + 4) == 0) + return PIDX_KEM_PARAM_OPERATION; + } + break; + case 't': + if (strcmp("ions", s + 3) == 0) + return PIDX_LIBSSL_RECORD_LAYER_PARAM_OPTIONS; + } + break; + case 'r': + if (strcmp("der", s + 2) == 0) + return PIDX_PKEY_PARAM_EC_ORDER; + } + break; + case 'p': + switch(s[1]) { + default: + break; + case '1': + switch(s[2]) { + default: + break; + case '\0': + return PIDX_PKEY_PARAM_RSA_TEST_P1; + } + break; + case '2': + switch(s[2]) { + default: + break; + case '\0': + return PIDX_PKEY_PARAM_RSA_TEST_P2; + } + break; + case 'a': + switch(s[2]) { + default: + break; + case 'd': + switch(s[3]) { + default: + break; + case '-': + switch(s[4]) { + default: + break; + case 'm': + if (strcmp("ode", s + 5) == 0) + return PIDX_PKEY_PARAM_PAD_MODE; + break; + case 't': + if (strcmp("ype", s + 5) == 0) + return PIDX_DIGEST_PARAM_PAD_TYPE; + } + break; + case 'd': + if (strcmp("ing", s + 4) == 0) + return PIDX_CIPHER_PARAM_PADDING; + break; + case '\0': + return PIDX_EXCHANGE_PARAM_PAD; + } + break; + case 'r': + switch(s[3]) { + default: + break; + case 't': + switch(s[4]) { + default: + break; + case 'y': + switch(s[5]) { + default: + break; + case 
'u': + if (strcmp("-info", s + 6) == 0) + return PIDX_KDF_PARAM_X942_PARTYUINFO; + break; + case 'v': + if (strcmp("-info", s + 6) == 0) + return PIDX_KDF_PARAM_X942_PARTYVINFO; + } + } + } + break; + case 's': + if (strcmp("s", s + 3) == 0) + return PIDX_KDF_PARAM_PASSWORD; + } + break; + case 'b': + switch(s[2]) { + default: + break; + case 'i': + if (strcmp("ts", s + 3) == 0) + return PIDX_PKEY_PARAM_FFC_PBITS; + break; + case 'k': + if (strcmp("df2-lower-bound-check", s + 3) == 0) + return PIDX_PROV_PARAM_PBKDF2_LOWER_BOUND_CHECK; + } + break; + case 'c': + if (strcmp("ounter", s + 2) == 0) + return PIDX_PKEY_PARAM_FFC_PCOUNTER; + break; + case 'k': + if (strcmp("cs5", s + 2) == 0) + return PIDX_KDF_PARAM_PKCS5; + break; + case 'o': + switch(s[2]) { + default: + break; + case 'i': + if (strcmp("nt-format", s + 3) == 0) + return PIDX_PKEY_PARAM_EC_POINT_CONVERSION_FORMAT; + break; + case 't': + if (strcmp("ential", s + 3) == 0) + return PIDX_GEN_PARAM_POTENTIAL; + } + break; + case 'r': + switch(s[2]) { + default: + break; + case 'e': + switch(s[3]) { + default: + break; + case 'd': + if (strcmp("iction_resistance", s + 4) == 0) + return PIDX_DRBG_PARAM_PREDICTION_RESISTANCE; + break; + case 'f': + if (strcmp("ix", s + 4) == 0) + return PIDX_KDF_PARAM_PREFIX; + } + break; + case 'i': + switch(s[3]) { + default: + break; + case 'm': + if (strcmp("es", s + 4) == 0) + return PIDX_PKEY_PARAM_RSA_PRIMES; + break; + case 'v': + switch(s[4]) { + default: + break; + case '_': + if (strcmp("len", s + 5) == 0) + return PIDX_PKEY_PARAM_DH_PRIV_LEN; + break; + case '\0': + return PIDX_PKEY_PARAM_PRIV_KEY; + } + } + break; + case 'o': + switch(s[3]) { + default: + break; + case 'p': + if (strcmp("erties", s + 4) == 0) + return PIDX_STORE_PARAM_PROPERTIES; + break; + case 'v': + if (strcmp("ider-name", s + 4) == 0) + return PIDX_PROV_PARAM_CORE_PROV_NAME; + } + } + break; + case 'u': + if (strcmp("b", s + 2) == 0) + return PIDX_PKEY_PARAM_PUB_KEY; + break; + case '\0': + return PIDX_PKEY_PARAM_FFC_P; + } + break; + case 'q': + switch(s[1]) { + default: + break; + case '1': + switch(s[2]) { + default: + break; + case '\0': + return PIDX_PKEY_PARAM_RSA_TEST_Q1; + } + break; + case '2': + switch(s[2]) { + default: + break; + case '\0': + return PIDX_PKEY_PARAM_RSA_TEST_Q2; + } + break; + case 'b': + if (strcmp("its", s + 2) == 0) + return PIDX_PKEY_PARAM_FFC_QBITS; + break; + case '\0': + return PIDX_PKEY_PARAM_FFC_Q; + break; + case 'x': + switch(s[2]) { + default: + break; + case '\0': + return PIDX_PKEY_PARAM_EC_PUB_X; + } + break; + case 'y': + switch(s[2]) { + default: + break; + case '\0': + return PIDX_PKEY_PARAM_EC_PUB_Y; + } + } + break; + case 'r': + switch(s[1]) { + default: + break; + case 'a': + switch(s[2]) { + default: + break; + case 'n': + switch(s[3]) { + default: + break; + case 'd': + switch(s[4]) { + default: + break; + case 'k': + if (strcmp("ey", s + 5) == 0) + return PIDX_CIPHER_PARAM_RANDOM_KEY; + break; + case 'o': + if (strcmp("m_data", s + 5) == 0) + return PIDX_DRBG_PARAM_RANDOM_DATA; + } + } + } + break; + case 'e': + switch(s[2]) { + default: + break; + case 'a': + switch(s[3]) { + default: + break; + case 'd': + switch(s[4]) { + default: + break; + case '_': + switch(s[5]) { + default: + break; + case 'a': + if (strcmp("head", s + 6) == 0) + return PIDX_LIBSSL_RECORD_LAYER_PARAM_READ_AHEAD; + break; + case 'b': + if (strcmp("uffer_len", s + 6) == 0) + return PIDX_LIBSSL_RECORD_LAYER_READ_BUFFER_LEN; + } + } + } + break; + case 'f': + if (strcmp("erence", s + 3) == 0) + 
return PIDX_OBJECT_PARAM_REFERENCE; + break; + case 's': + switch(s[3]) { + default: + break; + case 'e': + switch(s[4]) { + default: + break; + case 'e': + switch(s[5]) { + default: + break; + case 'd': + switch(s[6]) { + default: + break; + case '_': + switch(s[7]) { + default: + break; + case 'c': + if (strcmp("ounter", s + 8) == 0) + return PIDX_DRBG_PARAM_RESEED_COUNTER; + break; + case 'r': + if (strcmp("equests", s + 8) == 0) + return PIDX_DRBG_PARAM_RESEED_REQUESTS; + break; + case 't': + switch(s[8]) { + default: + break; + case 'i': + switch(s[9]) { + default: + break; + case 'm': + switch(s[10]) { + default: + break; + case 'e': + switch(s[11]) { + default: + break; + case '_': + if (strcmp("interval", s + 12) == 0) + return PIDX_DRBG_PARAM_RESEED_TIME_INTERVAL; + break; + case '\0': + return PIDX_DRBG_PARAM_RESEED_TIME; + } + } + } + } + } + } + } + } + } + } + break; + case 'o': + if (strcmp("unds", s + 2) == 0) + return PIDX_CIPHER_PARAM_ROUNDS; + break; + case 's': + switch(s[2]) { + default: + break; + case 'a': + switch(s[3]) { + default: + break; + case '-': + switch(s[4]) { + default: + break; + case 'c': + switch(s[5]) { + default: + break; + case 'o': + switch(s[6]) { + default: + break; + case 'e': + switch(s[7]) { + default: + break; + case 'f': + switch(s[8]) { + default: + break; + case 'f': + switch(s[9]) { + default: + break; + case 'i': + switch(s[10]) { + default: + break; + case 'c': + switch(s[11]) { + default: + break; + case 'i': + switch(s[12]) { + default: + break; + case 'e': + switch(s[13]) { + default: + break; + case 'n': + switch(s[14]) { + default: + break; + case 't': + switch(s[15]) { + default: + break; + case '1': + switch(s[16]) { + default: + break; + case '\0': + return PIDX_PKEY_PARAM_RSA_COEFFICIENT1; + } + break; + case '2': + switch(s[16]) { + default: + break; + case '\0': + return PIDX_PKEY_PARAM_RSA_COEFFICIENT2; + } + break; + case '3': + switch(s[16]) { + default: + break; + case '\0': + return PIDX_PKEY_PARAM_RSA_COEFFICIENT3; + } + break; + case '4': + switch(s[16]) { + default: + break; + case '\0': + return PIDX_PKEY_PARAM_RSA_COEFFICIENT4; + } + break; + case '5': + switch(s[16]) { + default: + break; + case '\0': + return PIDX_PKEY_PARAM_RSA_COEFFICIENT5; + } + break; + case '6': + switch(s[16]) { + default: + break; + case '\0': + return PIDX_PKEY_PARAM_RSA_COEFFICIENT6; + } + break; + case '7': + switch(s[16]) { + default: + break; + case '\0': + return PIDX_PKEY_PARAM_RSA_COEFFICIENT7; + } + break; + case '8': + switch(s[16]) { + default: + break; + case '\0': + return PIDX_PKEY_PARAM_RSA_COEFFICIENT8; + } + break; + case '9': + switch(s[16]) { + default: + break; + case '\0': + return PIDX_PKEY_PARAM_RSA_COEFFICIENT9; + } + break; + case '\0': + return PIDX_PKEY_PARAM_RSA_COEFFICIENT; + } + } + } + } + } + } + } + } + } + } + } + break; + case 'd': + if (strcmp("erive-from-pq", s + 5) == 0) + return PIDX_PKEY_PARAM_RSA_DERIVE_FROM_PQ; + break; + case 'e': + switch(s[5]) { + default: + break; + case 'x': + switch(s[6]) { + default: + break; + case 'p': + switch(s[7]) { + default: + break; + case 'o': + switch(s[8]) { + default: + break; + case 'n': + switch(s[9]) { + default: + break; + case 'e': + switch(s[10]) { + default: + break; + case 'n': + switch(s[11]) { + default: + break; + case 't': + switch(s[12]) { + default: + break; + case '1': + switch(s[13]) { + default: + break; + case '0': + switch(s[14]) { + default: + break; + case '\0': + return PIDX_PKEY_PARAM_RSA_EXPONENT10; + } + break; + case '\0': + return 
PIDX_PKEY_PARAM_RSA_EXPONENT1; + } + break; + case '2': + switch(s[13]) { + default: + break; + case '\0': + return PIDX_PKEY_PARAM_RSA_EXPONENT2; + } + break; + case '3': + switch(s[13]) { + default: + break; + case '\0': + return PIDX_PKEY_PARAM_RSA_EXPONENT3; + } + break; + case '4': + switch(s[13]) { + default: + break; + case '\0': + return PIDX_PKEY_PARAM_RSA_EXPONENT4; + } + break; + case '5': + switch(s[13]) { + default: + break; + case '\0': + return PIDX_PKEY_PARAM_RSA_EXPONENT5; + } + break; + case '6': + switch(s[13]) { + default: + break; + case '\0': + return PIDX_PKEY_PARAM_RSA_EXPONENT6; + } + break; + case '7': + switch(s[13]) { + default: + break; + case '\0': + return PIDX_PKEY_PARAM_RSA_EXPONENT7; + } + break; + case '8': + switch(s[13]) { + default: + break; + case '\0': + return PIDX_PKEY_PARAM_RSA_EXPONENT8; + } + break; + case '9': + switch(s[13]) { + default: + break; + case '\0': + return PIDX_PKEY_PARAM_RSA_EXPONENT9; + } + break; + case '\0': + return PIDX_PKEY_PARAM_RSA_EXPONENT; + } + } + } + } + } + } + } + } + break; + case 'f': + switch(s[5]) { + default: + break; + case 'a': + switch(s[6]) { + default: + break; + case 'c': + switch(s[7]) { + default: + break; + case 't': + switch(s[8]) { + default: + break; + case 'o': + switch(s[9]) { + default: + break; + case 'r': + switch(s[10]) { + default: + break; + case '1': + switch(s[11]) { + default: + break; + case '0': + switch(s[12]) { + default: + break; + case '\0': + return PIDX_PKEY_PARAM_RSA_FACTOR10; + } + break; + case '\0': + return PIDX_PKEY_PARAM_RSA_FACTOR1; + } + break; + case '2': + switch(s[11]) { + default: + break; + case '\0': + return PIDX_PKEY_PARAM_RSA_FACTOR2; + } + break; + case '3': + switch(s[11]) { + default: + break; + case '\0': + return PIDX_PKEY_PARAM_RSA_FACTOR3; + } + break; + case '4': + switch(s[11]) { + default: + break; + case '\0': + return PIDX_PKEY_PARAM_RSA_FACTOR4; + } + break; + case '5': + switch(s[11]) { + default: + break; + case '\0': + return PIDX_PKEY_PARAM_RSA_FACTOR5; + } + break; + case '6': + switch(s[11]) { + default: + break; + case '\0': + return PIDX_PKEY_PARAM_RSA_FACTOR6; + } + break; + case '7': + switch(s[11]) { + default: + break; + case '\0': + return PIDX_PKEY_PARAM_RSA_FACTOR7; + } + break; + case '8': + switch(s[11]) { + default: + break; + case '\0': + return PIDX_PKEY_PARAM_RSA_FACTOR8; + } + break; + case '9': + switch(s[11]) { + default: + break; + case '\0': + return PIDX_PKEY_PARAM_RSA_FACTOR9; + } + break; + case '\0': + return PIDX_PKEY_PARAM_RSA_FACTOR; + } + } + } + } + } + } + break; + case 'p': + switch(s[5]) { + default: + break; + case 'k': + if (strcmp("cs15-pad-disabled", s + 6) == 0) + return PIDX_PROV_PARAM_RSA_PKCS15_PAD_DISABLED; + break; + case 's': + if (strcmp("s-saltlen-check", s + 6) == 0) + return PIDX_SIGNATURE_PARAM_FIPS_RSA_PSS_SALTLEN_CHECK; + } + break; + case 's': + if (strcmp("ign-x931-pad-disabled", s + 5) == 0) + return PIDX_PROV_PARAM_RSA_SIGN_X931_PAD_DISABLED; + } + } + } + break; + case '\0': + return PIDX_KDF_PARAM_SCRYPT_R; + } + break; + case 's': + switch(s[1]) { + default: + break; + case 'a': + switch(s[2]) { + default: + break; + case 'f': + if (strcmp("eprime-generator", s + 3) == 0) + return PIDX_PKEY_PARAM_DH_GENERATOR; + break; + case 'l': + switch(s[3]) { + default: + break; + case 't': + switch(s[4]) { + default: + break; + case 'l': + if (strcmp("en", s + 5) == 0) + return PIDX_SIGNATURE_PARAM_PSS_SALTLEN; + break; + case '\0': + return PIDX_MAC_PARAM_SALT; + } + } + break; + case 'v': + if 
(strcmp("e-parameters", s + 3) == 0) + return PIDX_ENCODER_PARAM_SAVE_PARAMETERS; + } + break; + case 'e': + switch(s[2]) { + default: + break; + case 'c': + switch(s[3]) { + default: + break; + case 'r': + if (strcmp("et", s + 4) == 0) + return PIDX_KDF_PARAM_SECRET; + break; + case 'u': + switch(s[4]) { + default: + break; + case 'r': + switch(s[5]) { + default: + break; + case 'i': + switch(s[6]) { + default: + break; + case 't': + switch(s[7]) { + default: + break; + case 'y': + switch(s[8]) { + default: + break; + case '-': + switch(s[9]) { + default: + break; + case 'b': + if (strcmp("its", s + 10) == 0) + return PIDX_PKEY_PARAM_SECURITY_BITS; + break; + case 'c': + if (strcmp("hecks", s + 10) == 0) + return PIDX_PROV_PARAM_SECURITY_CHECKS; + } + } + } + } + } + } + } + break; + case 'e': + if (strcmp("d", s + 3) == 0) + return PIDX_PKEY_PARAM_FFC_SEED; + break; + case 'r': + if (strcmp("ial", s + 3) == 0) + return PIDX_STORE_PARAM_SERIAL; + break; + case 's': + if (strcmp("sion_id", s + 3) == 0) + return PIDX_KDF_PARAM_SSHKDF_SESSION_ID; + } + break; + case 'i': + switch(s[2]) { + default: + break; + case 'g': + switch(s[3]) { + default: + break; + case 'n': + switch(s[4]) { + default: + break; + case '-': + switch(s[5]) { + default: + break; + case 'c': + if (strcmp("heck", s + 6) == 0) + return PIDX_PKEY_PARAM_FIPS_SIGN_CHECK; + break; + case 'x': + if (strcmp("931-pad-check", s + 6) == 0) + return PIDX_SIGNATURE_PARAM_FIPS_SIGN_X931_PAD_CHECK; + } + break; + case 'a': + switch(s[5]) { + default: + break; + case 't': + switch(s[6]) { + default: + break; + case 'u': + switch(s[7]) { + default: + break; + case 'r': + switch(s[8]) { + default: + break; + case 'e': + switch(s[9]) { + default: + break; + case '-': + if (strcmp("digest-check", s + 10) == 0) + return PIDX_PROV_PARAM_SIGNATURE_DIGEST_CHECK; + break; + case '\0': + return PIDX_SIGNATURE_PARAM_SIGNATURE; + } + } + } + } + } + } + } + break; + case 'z': + if (strcmp("e", s + 3) == 0) + return PIDX_MAC_PARAM_SIZE; + } + break; + case 'p': + if (strcmp("eed", s + 2) == 0) + return PIDX_CIPHER_PARAM_SPEED; + break; + case 's': + switch(s[2]) { + default: + break; + case 'h': + switch(s[3]) { + default: + break; + case 'k': + switch(s[4]) { + default: + break; + case 'd': + switch(s[5]) { + default: + break; + case 'f': + switch(s[6]) { + default: + break; + case '-': + switch(s[7]) { + default: + break; + case 'd': + if (strcmp("igest-check", s + 8) == 0) + return PIDX_PROV_PARAM_SSHKDF_DIGEST_CHECK; + break; + case 'k': + if (strcmp("ey-check", s + 8) == 0) + return PIDX_PROV_PARAM_SSHKDF_KEY_CHECK; + } + } + } + } + } + break; + case 'k': + switch(s[3]) { + default: + break; + case 'd': + switch(s[4]) { + default: + break; + case 'f': + switch(s[5]) { + default: + break; + case '-': + switch(s[6]) { + default: + break; + case 'd': + if (strcmp("igest-check", s + 7) == 0) + return PIDX_PROV_PARAM_SSKDF_DIGEST_CHECK; + break; + case 'k': + if (strcmp("ey-check", s + 7) == 0) + return PIDX_PROV_PARAM_SSKDF_KEY_CHECK; + } + } + } + } + break; + case 'l': + if (strcmp("3-ms", s + 3) == 0) + return PIDX_DIGEST_PARAM_SSL3_MS; + } + break; + case 't': + switch(s[2]) { + default: + break; + case '-': + switch(s[3]) { + default: + break; + case 'd': + if (strcmp("esc", s + 4) == 0) + return PIDX_PROV_PARAM_SELF_TEST_DESC; + break; + case 'p': + if (strcmp("hase", s + 4) == 0) + return PIDX_PROV_PARAM_SELF_TEST_PHASE; + break; + case 't': + if (strcmp("ype", s + 4) == 0) + return PIDX_PROV_PARAM_SELF_TEST_TYPE; + } + break; + case 'a': 
+ switch(s[3]) { + default: + break; + case 't': + switch(s[4]) { + default: + break; + case 'e': + switch(s[5]) { + default: + break; + case '\0': + return PIDX_RAND_PARAM_STATE; + } + break; + case 'u': + if (strcmp("s", s + 5) == 0) + return PIDX_PROV_PARAM_STATUS; + } + } + break; + case 'r': + switch(s[3]) { + default: + break; + case 'e': + switch(s[4]) { + default: + break; + case 'a': + if (strcmp("m_mac", s + 5) == 0) + return PIDX_LIBSSL_RECORD_LAYER_PARAM_STREAM_MAC; + break; + case 'n': + if (strcmp("gth", s + 5) == 0) + return PIDX_RAND_PARAM_STRENGTH; + } + } + } + break; + case 'u': + switch(s[2]) { + default: + break; + case 'b': + if (strcmp("ject", s + 3) == 0) + return PIDX_STORE_PARAM_SUBJECT; + break; + case 'p': + switch(s[3]) { + default: + break; + case 'p': + switch(s[4]) { + default: + break; + case '-': + switch(s[5]) { + default: + break; + case 'p': + switch(s[6]) { + default: + break; + case 'r': + if (strcmp("ivinfo", s + 7) == 0) + return PIDX_KDF_PARAM_X942_SUPP_PRIVINFO; + break; + case 'u': + if (strcmp("binfo", s + 7) == 0) + return PIDX_KDF_PARAM_X942_SUPP_PUBINFO; + } + } + } + } + } + } + break; + case 't': + switch(s[1]) { + default: + break; + case 'a': + switch(s[2]) { + default: + break; + case 'g': + switch(s[3]) { + default: + break; + case 'l': + if (strcmp("en", s + 4) == 0) + return PIDX_CIPHER_PARAM_AEAD_TAGLEN; + break; + case '\0': + return PIDX_CIPHER_PARAM_AEAD_TAG; + } + } + break; + case 'd': + if (strcmp("es-encrypt-disabled", s + 2) == 0) + return PIDX_PROV_PARAM_TDES_ENCRYPT_DISABLED; + break; + case 'e': + switch(s[2]) { + default: + break; + case 's': + switch(s[3]) { + default: + break; + case 't': + switch(s[4]) { + default: + break; + case '_': + switch(s[5]) { + default: + break; + case 'e': + if (strcmp("ntropy", s + 6) == 0) + return PIDX_RAND_PARAM_TEST_ENTROPY; + break; + case 'n': + if (strcmp("once", s + 6) == 0) + return PIDX_RAND_PARAM_TEST_NONCE; + } + } + } + } + break; + case 'h': + if (strcmp("reads", s + 2) == 0) + return PIDX_KDF_PARAM_THREADS; + break; + case 'l': + switch(s[2]) { + default: + break; + case 's': + switch(s[3]) { + default: + break; + case '-': + switch(s[4]) { + default: + break; + case 'c': + if (strcmp("lient-version", s + 5) == 0) + return PIDX_ASYM_CIPHER_PARAM_TLS_CLIENT_VERSION; + break; + case 'd': + if (strcmp("ata-size", s + 5) == 0) + return PIDX_MAC_PARAM_TLS_DATA_SIZE; + break; + case 'g': + switch(s[5]) { + default: + break; + case 'r': + switch(s[6]) { + default: + break; + case 'o': + switch(s[7]) { + default: + break; + case 'u': + switch(s[8]) { + default: + break; + case 'p': + switch(s[9]) { + default: + break; + case '-': + switch(s[10]) { + default: + break; + case 'a': + if (strcmp("lg", s + 11) == 0) + return PIDX_CAPABILITY_TLS_GROUP_ALG; + break; + case 'i': + switch(s[11]) { + default: + break; + case 'd': + switch(s[12]) { + default: + break; + case '\0': + return PIDX_CAPABILITY_TLS_GROUP_ID; + } + break; + case 's': + if (strcmp("-kem", s + 12) == 0) + return PIDX_CAPABILITY_TLS_GROUP_IS_KEM; + } + break; + case 'n': + switch(s[11]) { + default: + break; + case 'a': + switch(s[12]) { + default: + break; + case 'm': + switch(s[13]) { + default: + break; + case 'e': + switch(s[14]) { + default: + break; + case '-': + if (strcmp("internal", s + 15) == 0) + return PIDX_CAPABILITY_TLS_GROUP_NAME_INTERNAL; + break; + case '\0': + return PIDX_CAPABILITY_TLS_GROUP_NAME; + } + } + } + } + break; + case 's': + if (strcmp("ec-bits", s + 11) == 0) + return 
PIDX_CAPABILITY_TLS_GROUP_SECURITY_BITS; + } + } + } + } + } + } + break; + case 'm': + switch(s[5]) { + default: + break; + case 'a': + switch(s[6]) { + default: + break; + case 'c': + switch(s[7]) { + default: + break; + case '-': + if (strcmp("size", s + 8) == 0) + return PIDX_CIPHER_PARAM_TLS_MAC_SIZE; + break; + case '\0': + return PIDX_CIPHER_PARAM_TLS_MAC; + } + break; + case 'x': + switch(s[7]) { + default: + break; + case '-': + switch(s[8]) { + default: + break; + case 'd': + if (strcmp("tls", s + 9) == 0) + return PIDX_CAPABILITY_TLS_GROUP_MAX_DTLS; + break; + case 't': + if (strcmp("ls", s + 9) == 0) + return PIDX_CAPABILITY_TLS_SIGALG_MAX_TLS; + } + } + } + break; + case 'i': + switch(s[6]) { + default: + break; + case 'n': + switch(s[7]) { + default: + break; + case '-': + switch(s[8]) { + default: + break; + case 'd': + if (strcmp("tls", s + 9) == 0) + return PIDX_CAPABILITY_TLS_GROUP_MIN_DTLS; + break; + case 't': + if (strcmp("ls", s + 9) == 0) + return PIDX_CAPABILITY_TLS_SIGALG_MIN_TLS; + } + } + } + break; + case 'u': + if (strcmp("lti", s + 6) == 0) + return PIDX_CIPHER_PARAM_TLS1_MULTIBLOCK; + } + break; + case 'n': + if (strcmp("egotiated-version", s + 5) == 0) + return PIDX_ASYM_CIPHER_PARAM_TLS_NEGOTIATED_VERSION; + break; + case 's': + switch(s[5]) { + default: + break; + case 'i': + switch(s[6]) { + default: + break; + case 'g': + switch(s[7]) { + default: + break; + case 'a': + switch(s[8]) { + default: + break; + case 'l': + switch(s[9]) { + default: + break; + case 'g': + switch(s[10]) { + default: + break; + case '-': + switch(s[11]) { + default: + break; + case 'c': + if (strcmp("ode-point", s + 12) == 0) + return PIDX_CAPABILITY_TLS_SIGALG_CODE_POINT; + break; + case 'h': + switch(s[12]) { + default: + break; + case 'a': + switch(s[13]) { + default: + break; + case 's': + switch(s[14]) { + default: + break; + case 'h': + switch(s[15]) { + default: + break; + case '-': + switch(s[16]) { + default: + break; + case 'n': + if (strcmp("ame", s + 17) == 0) + return PIDX_CAPABILITY_TLS_SIGALG_HASH_NAME; + break; + case 'o': + if (strcmp("id", s + 17) == 0) + return PIDX_CAPABILITY_TLS_SIGALG_HASH_OID; + } + } + } + } + } + break; + case 'i': + if (strcmp("ana-name", s + 12) == 0) + return PIDX_CAPABILITY_TLS_SIGALG_IANA_NAME; + break; + case 'k': + switch(s[12]) { + default: + break; + case 'e': + switch(s[13]) { + default: + break; + case 'y': + switch(s[14]) { + default: + break; + case 't': + switch(s[15]) { + default: + break; + case 'y': + switch(s[16]) { + default: + break; + case 'p': + switch(s[17]) { + default: + break; + case 'e': + switch(s[18]) { + default: + break; + case '-': + if (strcmp("oid", s + 19) == 0) + return PIDX_CAPABILITY_TLS_SIGALG_KEYTYPE_OID; + break; + case '\0': + return PIDX_CAPABILITY_TLS_SIGALG_KEYTYPE; + } + } + } + } + } + } + } + break; + case 'n': + if (strcmp("ame", s + 12) == 0) + return PIDX_CAPABILITY_TLS_SIGALG_NAME; + break; + case 'o': + if (strcmp("id", s + 12) == 0) + return PIDX_CAPABILITY_TLS_SIGALG_OID; + break; + case 's': + switch(s[12]) { + default: + break; + case 'e': + if (strcmp("c-bits", s + 13) == 0) + return PIDX_CAPABILITY_TLS_SIGALG_SECURITY_BITS; + break; + case 'i': + switch(s[13]) { + default: + break; + case 'g': + switch(s[14]) { + default: + break; + case '-': + switch(s[15]) { + default: + break; + case 'n': + if (strcmp("ame", s + 16) == 0) + return PIDX_CAPABILITY_TLS_SIGALG_SIG_NAME; + break; + case 'o': + if (strcmp("id", s + 16) == 0) + return PIDX_CAPABILITY_TLS_SIGALG_SIG_OID; + } + } + } 
+ } + } + } + } + } + } + } + } + break; + case 'v': + if (strcmp("ersion", s + 5) == 0) + return PIDX_CIPHER_PARAM_TLS_VERSION; + } + break; + case '1': + switch(s[4]) { + default: + break; + case '-': + switch(s[5]) { + default: + break; + case 'p': + switch(s[6]) { + default: + break; + case 'r': + switch(s[7]) { + default: + break; + case 'f': + switch(s[8]) { + default: + break; + case '-': + switch(s[9]) { + default: + break; + case 'd': + if (strcmp("igest-check", s + 10) == 0) + return PIDX_PROV_PARAM_TLS1_PRF_DIGEST_CHECK; + break; + case 'e': + if (strcmp("ms-check", s + 10) == 0) + return PIDX_PROV_PARAM_TLS1_PRF_EMS_CHECK; + break; + case 'k': + if (strcmp("ey-check", s + 10) == 0) + return PIDX_PROV_PARAM_TLS1_PRF_KEY_CHECK; + } + } + } + } + } + break; + case '3': + switch(s[5]) { + default: + break; + case '-': + switch(s[6]) { + default: + break; + case 'k': + switch(s[7]) { + default: + break; + case 'd': + switch(s[8]) { + default: + break; + case 'f': + switch(s[9]) { + default: + break; + case '-': + switch(s[10]) { + default: + break; + case 'd': + if (strcmp("igest-check", s + 11) == 0) + return PIDX_PROV_PARAM_TLS13_KDF_DIGEST_CHECK; + break; + case 'k': + if (strcmp("ey-check", s + 11) == 0) + return PIDX_PROV_PARAM_TLS13_KDF_KEY_CHECK; + } + } + } + } + } + } + break; + case 'm': + switch(s[5]) { + default: + break; + case 'u': + switch(s[6]) { + default: + break; + case 'l': + switch(s[7]) { + default: + break; + case 't': + switch(s[8]) { + default: + break; + case 'i': + switch(s[9]) { + default: + break; + case '_': + switch(s[10]) { + default: + break; + case 'a': + switch(s[11]) { + default: + break; + case 'a': + switch(s[12]) { + default: + break; + case 'd': + switch(s[13]) { + default: + break; + case 'p': + if (strcmp("acklen", s + 14) == 0) + return PIDX_CIPHER_PARAM_TLS1_MULTIBLOCK_AAD_PACKLEN; + break; + case '\0': + return PIDX_CIPHER_PARAM_TLS1_MULTIBLOCK_AAD; + } + } + } + break; + case 'e': + switch(s[11]) { + default: + break; + case 'n': + switch(s[12]) { + default: + break; + case 'c': + switch(s[13]) { + default: + break; + case 'i': + if (strcmp("n", s + 14) == 0) + return PIDX_CIPHER_PARAM_TLS1_MULTIBLOCK_ENC_IN; + break; + case 'l': + if (strcmp("en", s + 14) == 0) + return PIDX_CIPHER_PARAM_TLS1_MULTIBLOCK_ENC_LEN; + break; + case '\0': + return PIDX_CIPHER_PARAM_TLS1_MULTIBLOCK_ENC; + } + } + } + break; + case 'i': + if (strcmp("nterleave", s + 11) == 0) + return PIDX_CIPHER_PARAM_TLS1_MULTIBLOCK_INTERLEAVE; + break; + case 'm': + switch(s[11]) { + default: + break; + case 'a': + switch(s[12]) { + default: + break; + case 'x': + switch(s[13]) { + default: + break; + case 'b': + if (strcmp("ufsz", s + 14) == 0) + return PIDX_CIPHER_PARAM_TLS1_MULTIBLOCK_MAX_BUFSIZE; + break; + case 's': + if (strcmp("ndfrag", s + 14) == 0) + return PIDX_CIPHER_PARAM_TLS1_MULTIBLOCK_MAX_SEND_FRAGMENT; + } + } + } + } + } + } + } + } + } + } + break; + case 'a': + switch(s[4]) { + default: + break; + case 'a': + switch(s[5]) { + default: + break; + case 'd': + switch(s[6]) { + default: + break; + case 'p': + if (strcmp("ad", s + 7) == 0) + return PIDX_CIPHER_PARAM_AEAD_TLS1_AAD_PAD; + break; + case '\0': + return PIDX_CIPHER_PARAM_AEAD_TLS1_AAD; + } + } + } + break; + case 'i': + switch(s[4]) { + default: + break; + case 'v': + switch(s[5]) { + default: + break; + case 'f': + if (strcmp("ixed", s + 6) == 0) + return PIDX_CIPHER_PARAM_AEAD_TLS1_IV_FIXED; + break; + case 'g': + if (strcmp("en", s + 6) == 0) + return PIDX_CIPHER_PARAM_AEAD_TLS1_GET_IV_GEN; + 
break; + case 'i': + if (strcmp("nv", s + 6) == 0) + return PIDX_CIPHER_PARAM_AEAD_TLS1_SET_IV_INV; + } + } + break; + case 't': + if (strcmp("ree", s + 4) == 0) + return PIDX_LIBSSL_RECORD_LAYER_PARAM_TLSTREE; + } + } + break; + case 'p': + switch(s[2]) { + default: + break; + case '\0': + return PIDX_PKEY_PARAM_EC_CHAR2_TP_BASIS; + } + break; + case 'y': + if (strcmp("pe", s + 2) == 0) + return PIDX_PKEY_PARAM_FFC_TYPE; + } + break; + case 'u': + switch(s[1]) { + default: + break; + case 'k': + if (strcmp("m", s + 2) == 0) + return PIDX_KDF_PARAM_UKM; + break; + case 'p': + if (strcmp("dated-iv", s + 2) == 0) + return PIDX_CIPHER_PARAM_UPDATED_IV; + break; + case 's': + switch(s[2]) { + default: + break; + case 'e': + switch(s[3]) { + default: + break; + case '-': + switch(s[4]) { + default: + break; + case 'b': + if (strcmp("its", s + 5) == 0) + return PIDX_CIPHER_PARAM_USE_BITS; + break; + case 'c': + if (strcmp("ofactor-flag", s + 5) == 0) + return PIDX_PKEY_PARAM_USE_COFACTOR_FLAG; + break; + case 'k': + if (strcmp("eybits", s + 5) == 0) + return PIDX_KDF_PARAM_X942_USE_KEYBITS; + break; + case 'l': + switch(s[5]) { + default: + break; + case '\0': + return PIDX_KDF_PARAM_KBKDF_USE_L; + } + break; + case 's': + if (strcmp("eparator", s + 5) == 0) + return PIDX_KDF_PARAM_KBKDF_USE_SEPARATOR; + } + break; + case '_': + switch(s[4]) { + default: + break; + case 'd': + if (strcmp("erivation_function", s + 5) == 0) + return PIDX_DRBG_PARAM_USE_DF; + break; + case 'e': + if (strcmp("tm", s + 5) == 0) + return PIDX_LIBSSL_RECORD_LAYER_PARAM_USE_ETM; + } + } + } + } + break; + case 'v': + switch(s[1]) { + default: + break; + case 'a': + switch(s[2]) { + default: + break; + case 'l': + switch(s[3]) { + default: + break; + case 'i': + switch(s[4]) { + default: + break; + case 'd': + switch(s[5]) { + default: + break; + case 'a': + switch(s[6]) { + default: + break; + case 't': + switch(s[7]) { + default: + break; + case 'e': + switch(s[8]) { + default: + break; + case '-': + switch(s[9]) { + default: + break; + case 'g': + switch(s[10]) { + default: + break; + case '\0': + return PIDX_PKEY_PARAM_FFC_VALIDATE_G; + } + break; + case 'l': + if (strcmp("egacy", s + 10) == 0) + return PIDX_PKEY_PARAM_FFC_VALIDATE_LEGACY; + break; + case 'p': + if (strcmp("q", s + 10) == 0) + return PIDX_PKEY_PARAM_FFC_VALIDATE_PQ; + } + } + } + } + } + } + } + } + break; + case 'e': + switch(s[2]) { + default: + break; + case 'r': + switch(s[3]) { + default: + break; + case 'i': + if (strcmp("fy-message", s + 4) == 0) + return PIDX_SIGNATURE_PARAM_FIPS_VERIFY_MESSAGE; + break; + case 's': + if (strcmp("ion", s + 4) == 0) + return PIDX_PROV_PARAM_VERSION; + } + } + } + break; + case 'x': + switch(s[1]) { + default: + break; + case '9': + switch(s[2]) { + default: + break; + case '4': + if (strcmp("2kdf-key-check", s + 3) == 0) + return PIDX_PROV_PARAM_X942KDF_KEY_CHECK; + break; + case '6': + switch(s[3]) { + default: + break; + case '3': + switch(s[4]) { + default: + break; + case 'k': + switch(s[5]) { + default: + break; + case 'd': + switch(s[6]) { + default: + break; + case 'f': + switch(s[7]) { + default: + break; + case '-': + switch(s[8]) { + default: + break; + case 'd': + if (strcmp("igest-check", s + 9) == 0) + return PIDX_PROV_PARAM_X963KDF_DIGEST_CHECK; + break; + case 'k': + if (strcmp("ey-check", s + 9) == 0) + return PIDX_PROV_PARAM_X963KDF_KEY_CHECK; + } + } + } + } + } + } + } + break; + case 'c': + if (strcmp("ghash", s + 2) == 0) + return PIDX_KDF_PARAM_SSHKDF_XCGHASH; + break; + case 'o': + 
switch(s[2]) { + default: + break; + case 'f': + switch(s[3]) { + default: + break; + case 'l': + if (strcmp("en", s + 4) == 0) + return PIDX_DIGEST_PARAM_XOFLEN; + break; + case '\0': + return PIDX_MAC_PARAM_XOF; + } + } + break; + case 'p': + switch(s[2]) { + default: + break; + case '1': + switch(s[3]) { + default: + break; + case '\0': + return PIDX_PKEY_PARAM_RSA_TEST_XP1; + } + break; + case '2': + switch(s[3]) { + default: + break; + case '\0': + return PIDX_PKEY_PARAM_RSA_TEST_XP2; + } + break; + case '\0': + return PIDX_PKEY_PARAM_RSA_TEST_XP; + } + break; + case 'q': + switch(s[2]) { + default: + break; + case '1': + switch(s[3]) { + default: + break; + case '\0': + return PIDX_PKEY_PARAM_RSA_TEST_XQ1; + } + break; + case '2': + switch(s[3]) { + default: + break; + case '\0': + return PIDX_PKEY_PARAM_RSA_TEST_XQ2; + } + break; + case '\0': + return PIDX_PKEY_PARAM_RSA_TEST_XQ; + } + break; + case 't': + if (strcmp("s_standard", s + 2) == 0) + return PIDX_CIPHER_PARAM_XTS_STANDARD; + } + } + return -1; +} + +/* End of TRIE */ diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/include/internal/param_names.h b/CryptoPkg/Library/OpensslLib/OpensslGen/include/internal/param_names.h new file mode 100644 index 000000000000..27bcea8137a2 --- /dev/null +++ b/CryptoPkg/Library/OpensslLib/OpensslGen/include/internal/param_names.h @@ -0,0 +1,446 @@ +/* + * WARNING: do not edit! + * Generated by Makefile from include/internal/param_names.h.in + * + * Copyright 2023 The OpenSSL Project Authors. All Rights Reserved. + * + * Licensed under the Apache License 2.0 (the "License"). You may not use + * this file except in compliance with the License. You can obtain a copy + * in the file LICENSE in the source distribution or at + * https://www.openssl.org/source/license.html + */ + + +int ossl_param_find_pidx(const char *s); + +/* Parameter name definitions - generated by util/perl/OpenSSL/paramnames.pm */ +#define NUM_PIDX 329 + +#define PIDX_ALG_PARAM_ALGORITHM_ID 0 +#define PIDX_ALG_PARAM_ALGORITHM_ID_PARAMS 1 +#define PIDX_ALG_PARAM_CIPHER 2 +#define PIDX_ALG_PARAM_DIGEST 3 +#define PIDX_ALG_PARAM_ENGINE 4 +#define PIDX_ALG_PARAM_FIPS_APPROVED_INDICATOR 5 +#define PIDX_ALG_PARAM_MAC 6 +#define PIDX_ALG_PARAM_PROPERTIES 7 +#define PIDX_ASYM_CIPHER_PARAM_DIGEST PIDX_PKEY_PARAM_DIGEST +#define PIDX_ASYM_CIPHER_PARAM_ENGINE PIDX_PKEY_PARAM_ENGINE +#define PIDX_ASYM_CIPHER_PARAM_FIPS_APPROVED_INDICATOR PIDX_ALG_PARAM_FIPS_APPROVED_INDICATOR +#define PIDX_ASYM_CIPHER_PARAM_FIPS_KEY_CHECK PIDX_PKEY_PARAM_FIPS_KEY_CHECK +#define PIDX_ASYM_CIPHER_PARAM_FIPS_RSA_PKCS15_PAD_DISABLED PIDX_PROV_PARAM_RSA_PKCS15_PAD_DISABLED +#define PIDX_ASYM_CIPHER_PARAM_IMPLICIT_REJECTION 8 +#define PIDX_ASYM_CIPHER_PARAM_MGF1_DIGEST PIDX_PKEY_PARAM_MGF1_DIGEST +#define PIDX_ASYM_CIPHER_PARAM_MGF1_DIGEST_PROPS PIDX_PKEY_PARAM_MGF1_PROPERTIES +#define PIDX_ASYM_CIPHER_PARAM_OAEP_DIGEST PIDX_ALG_PARAM_DIGEST +#define PIDX_ASYM_CIPHER_PARAM_OAEP_DIGEST_PROPS 9 +#define PIDX_ASYM_CIPHER_PARAM_OAEP_LABEL 10 +#define PIDX_ASYM_CIPHER_PARAM_PAD_MODE PIDX_PKEY_PARAM_PAD_MODE +#define PIDX_ASYM_CIPHER_PARAM_PROPERTIES PIDX_PKEY_PARAM_PROPERTIES +#define PIDX_ASYM_CIPHER_PARAM_TLS_CLIENT_VERSION 11 +#define PIDX_ASYM_CIPHER_PARAM_TLS_NEGOTIATED_VERSION 12 +#define PIDX_CAPABILITY_TLS_GROUP_ALG 13 +#define PIDX_CAPABILITY_TLS_GROUP_ID 14 +#define PIDX_CAPABILITY_TLS_GROUP_IS_KEM 15 +#define PIDX_CAPABILITY_TLS_GROUP_MAX_DTLS 16 +#define PIDX_CAPABILITY_TLS_GROUP_MAX_TLS 17 +#define PIDX_CAPABILITY_TLS_GROUP_MIN_DTLS 18 +#define 
PIDX_CAPABILITY_TLS_GROUP_MIN_TLS 19 +#define PIDX_CAPABILITY_TLS_GROUP_NAME 20 +#define PIDX_CAPABILITY_TLS_GROUP_NAME_INTERNAL 21 +#define PIDX_CAPABILITY_TLS_GROUP_SECURITY_BITS 22 +#define PIDX_CAPABILITY_TLS_SIGALG_CODE_POINT 23 +#define PIDX_CAPABILITY_TLS_SIGALG_HASH_NAME 24 +#define PIDX_CAPABILITY_TLS_SIGALG_HASH_OID 25 +#define PIDX_CAPABILITY_TLS_SIGALG_IANA_NAME 26 +#define PIDX_CAPABILITY_TLS_SIGALG_KEYTYPE 27 +#define PIDX_CAPABILITY_TLS_SIGALG_KEYTYPE_OID 28 +#define PIDX_CAPABILITY_TLS_SIGALG_MAX_TLS 17 +#define PIDX_CAPABILITY_TLS_SIGALG_MIN_TLS 19 +#define PIDX_CAPABILITY_TLS_SIGALG_NAME 29 +#define PIDX_CAPABILITY_TLS_SIGALG_OID 30 +#define PIDX_CAPABILITY_TLS_SIGALG_SECURITY_BITS 31 +#define PIDX_CAPABILITY_TLS_SIGALG_SIG_NAME 32 +#define PIDX_CAPABILITY_TLS_SIGALG_SIG_OID 33 +#define PIDX_CIPHER_PARAM_AEAD 34 +#define PIDX_CIPHER_PARAM_AEAD_IVLEN PIDX_CIPHER_PARAM_IVLEN +#define PIDX_CIPHER_PARAM_AEAD_IV_GENERATED 35 +#define PIDX_CIPHER_PARAM_AEAD_MAC_KEY 36 +#define PIDX_CIPHER_PARAM_AEAD_TAG 37 +#define PIDX_CIPHER_PARAM_AEAD_TAGLEN 38 +#define PIDX_CIPHER_PARAM_AEAD_TLS1_AAD 39 +#define PIDX_CIPHER_PARAM_AEAD_TLS1_AAD_PAD 40 +#define PIDX_CIPHER_PARAM_AEAD_TLS1_GET_IV_GEN 41 +#define PIDX_CIPHER_PARAM_AEAD_TLS1_IV_FIXED 42 +#define PIDX_CIPHER_PARAM_AEAD_TLS1_SET_IV_INV 43 +#define PIDX_CIPHER_PARAM_ALGORITHM_ID PIDX_ALG_PARAM_ALGORITHM_ID +#define PIDX_CIPHER_PARAM_ALGORITHM_ID_PARAMS PIDX_ALG_PARAM_ALGORITHM_ID_PARAMS +#define PIDX_CIPHER_PARAM_ALGORITHM_ID_PARAMS_OLD 44 +#define PIDX_CIPHER_PARAM_BLOCK_SIZE 45 +#define PIDX_CIPHER_PARAM_CTS 46 +#define PIDX_CIPHER_PARAM_CTS_MODE 47 +#define PIDX_CIPHER_PARAM_CUSTOM_IV 48 +#define PIDX_CIPHER_PARAM_DECRYPT_ONLY 49 +#define PIDX_CIPHER_PARAM_FIPS_APPROVED_INDICATOR PIDX_ALG_PARAM_FIPS_APPROVED_INDICATOR +#define PIDX_CIPHER_PARAM_FIPS_ENCRYPT_CHECK 50 +#define PIDX_CIPHER_PARAM_HAS_RAND_KEY 51 +#define PIDX_CIPHER_PARAM_IV 52 +#define PIDX_CIPHER_PARAM_IVLEN 53 +#define PIDX_CIPHER_PARAM_KEYLEN 54 +#define PIDX_CIPHER_PARAM_MODE 55 +#define PIDX_CIPHER_PARAM_NUM 56 +#define PIDX_CIPHER_PARAM_PADDING 57 +#define PIDX_CIPHER_PARAM_RANDOM_KEY 58 +#define PIDX_CIPHER_PARAM_RC2_KEYBITS 59 +#define PIDX_CIPHER_PARAM_ROUNDS 60 +#define PIDX_CIPHER_PARAM_SPEED 61 +#define PIDX_CIPHER_PARAM_TLS1_MULTIBLOCK 62 +#define PIDX_CIPHER_PARAM_TLS1_MULTIBLOCK_AAD 63 +#define PIDX_CIPHER_PARAM_TLS1_MULTIBLOCK_AAD_PACKLEN 64 +#define PIDX_CIPHER_PARAM_TLS1_MULTIBLOCK_ENC 65 +#define PIDX_CIPHER_PARAM_TLS1_MULTIBLOCK_ENC_IN 66 +#define PIDX_CIPHER_PARAM_TLS1_MULTIBLOCK_ENC_LEN 67 +#define PIDX_CIPHER_PARAM_TLS1_MULTIBLOCK_INTERLEAVE 68 +#define PIDX_CIPHER_PARAM_TLS1_MULTIBLOCK_MAX_BUFSIZE 69 +#define PIDX_CIPHER_PARAM_TLS1_MULTIBLOCK_MAX_SEND_FRAGMENT 70 +#define PIDX_CIPHER_PARAM_TLS_MAC 71 +#define PIDX_CIPHER_PARAM_TLS_MAC_SIZE 72 +#define PIDX_CIPHER_PARAM_TLS_VERSION 73 +#define PIDX_CIPHER_PARAM_UPDATED_IV 74 +#define PIDX_CIPHER_PARAM_USE_BITS 75 +#define PIDX_CIPHER_PARAM_XTS_STANDARD 76 +#define PIDX_DECODER_PARAM_PROPERTIES PIDX_ALG_PARAM_PROPERTIES +#define PIDX_DIGEST_PARAM_ALGID_ABSENT 77 +#define PIDX_DIGEST_PARAM_BLOCK_SIZE 45 +#define PIDX_DIGEST_PARAM_MICALG 78 +#define PIDX_DIGEST_PARAM_PAD_TYPE 79 +#define PIDX_DIGEST_PARAM_SIZE 80 +#define PIDX_DIGEST_PARAM_SSL3_MS 81 +#define PIDX_DIGEST_PARAM_XOF 82 +#define PIDX_DIGEST_PARAM_XOFLEN 83 +#define PIDX_DRBG_PARAM_CIPHER PIDX_ALG_PARAM_CIPHER +#define PIDX_DRBG_PARAM_DIGEST PIDX_ALG_PARAM_DIGEST +#define PIDX_DRBG_PARAM_ENTROPY_REQUIRED 84 +#define 
PIDX_DRBG_PARAM_FIPS_APPROVED_INDICATOR PIDX_ALG_PARAM_FIPS_APPROVED_INDICATOR +#define PIDX_DRBG_PARAM_FIPS_DIGEST_CHECK PIDX_PKEY_PARAM_FIPS_DIGEST_CHECK +#define PIDX_DRBG_PARAM_MAC PIDX_ALG_PARAM_MAC +#define PIDX_DRBG_PARAM_MAX_ADINLEN 85 +#define PIDX_DRBG_PARAM_MAX_ENTROPYLEN 86 +#define PIDX_DRBG_PARAM_MAX_LENGTH 87 +#define PIDX_DRBG_PARAM_MAX_NONCELEN 88 +#define PIDX_DRBG_PARAM_MAX_PERSLEN 89 +#define PIDX_DRBG_PARAM_MIN_ENTROPYLEN 90 +#define PIDX_DRBG_PARAM_MIN_LENGTH 91 +#define PIDX_DRBG_PARAM_MIN_NONCELEN 92 +#define PIDX_DRBG_PARAM_PREDICTION_RESISTANCE 93 +#define PIDX_DRBG_PARAM_PROPERTIES PIDX_ALG_PARAM_PROPERTIES +#define PIDX_DRBG_PARAM_RANDOM_DATA 94 +#define PIDX_DRBG_PARAM_RESEED_COUNTER 95 +#define PIDX_DRBG_PARAM_RESEED_REQUESTS 96 +#define PIDX_DRBG_PARAM_RESEED_TIME 97 +#define PIDX_DRBG_PARAM_RESEED_TIME_INTERVAL 98 +#define PIDX_DRBG_PARAM_SIZE 80 +#define PIDX_DRBG_PARAM_USE_DF 99 +#define PIDX_ENCODER_PARAM_CIPHER PIDX_ALG_PARAM_CIPHER +#define PIDX_ENCODER_PARAM_ENCRYPT_LEVEL 100 +#define PIDX_ENCODER_PARAM_PROPERTIES PIDX_ALG_PARAM_PROPERTIES +#define PIDX_ENCODER_PARAM_SAVE_PARAMETERS 101 +#define PIDX_EXCHANGE_PARAM_EC_ECDH_COFACTOR_MODE 102 +#define PIDX_EXCHANGE_PARAM_FIPS_APPROVED_INDICATOR PIDX_ALG_PARAM_FIPS_APPROVED_INDICATOR +#define PIDX_EXCHANGE_PARAM_FIPS_DIGEST_CHECK PIDX_PKEY_PARAM_FIPS_DIGEST_CHECK +#define PIDX_EXCHANGE_PARAM_FIPS_ECDH_COFACTOR_CHECK PIDX_PROV_PARAM_ECDH_COFACTOR_CHECK +#define PIDX_EXCHANGE_PARAM_FIPS_KEY_CHECK PIDX_PKEY_PARAM_FIPS_KEY_CHECK +#define PIDX_EXCHANGE_PARAM_KDF_DIGEST 103 +#define PIDX_EXCHANGE_PARAM_KDF_DIGEST_PROPS 104 +#define PIDX_EXCHANGE_PARAM_KDF_OUTLEN 105 +#define PIDX_EXCHANGE_PARAM_KDF_TYPE 106 +#define PIDX_EXCHANGE_PARAM_KDF_UKM 107 +#define PIDX_EXCHANGE_PARAM_PAD 108 +#define PIDX_GEN_PARAM_ITERATION 109 +#define PIDX_GEN_PARAM_POTENTIAL 110 +#define PIDX_KDF_PARAM_ARGON2_AD 111 +#define PIDX_KDF_PARAM_ARGON2_LANES 112 +#define PIDX_KDF_PARAM_ARGON2_MEMCOST 113 +#define PIDX_KDF_PARAM_ARGON2_VERSION 114 +#define PIDX_KDF_PARAM_CEK_ALG 115 +#define PIDX_KDF_PARAM_CIPHER PIDX_ALG_PARAM_CIPHER +#define PIDX_KDF_PARAM_CONSTANT 116 +#define PIDX_KDF_PARAM_DATA 117 +#define PIDX_KDF_PARAM_DIGEST PIDX_ALG_PARAM_DIGEST +#define PIDX_KDF_PARAM_EARLY_CLEAN 118 +#define PIDX_KDF_PARAM_FIPS_APPROVED_INDICATOR PIDX_ALG_PARAM_FIPS_APPROVED_INDICATOR +#define PIDX_KDF_PARAM_FIPS_DIGEST_CHECK PIDX_PKEY_PARAM_FIPS_DIGEST_CHECK +#define PIDX_KDF_PARAM_FIPS_EMS_CHECK 119 +#define PIDX_KDF_PARAM_FIPS_KEY_CHECK PIDX_PKEY_PARAM_FIPS_KEY_CHECK +#define PIDX_KDF_PARAM_HMACDRBG_ENTROPY 120 +#define PIDX_KDF_PARAM_HMACDRBG_NONCE 121 +#define PIDX_KDF_PARAM_INFO 122 +#define PIDX_KDF_PARAM_ITER 123 +#define PIDX_KDF_PARAM_KBKDF_R 124 +#define PIDX_KDF_PARAM_KBKDF_USE_L 125 +#define PIDX_KDF_PARAM_KBKDF_USE_SEPARATOR 126 +#define PIDX_KDF_PARAM_KEY 127 +#define PIDX_KDF_PARAM_LABEL 128 +#define PIDX_KDF_PARAM_MAC PIDX_ALG_PARAM_MAC +#define PIDX_KDF_PARAM_MAC_SIZE 129 +#define PIDX_KDF_PARAM_MODE 55 +#define PIDX_KDF_PARAM_PASSWORD 130 +#define PIDX_KDF_PARAM_PKCS12_ID 131 +#define PIDX_KDF_PARAM_PKCS5 132 +#define PIDX_KDF_PARAM_PREFIX 133 +#define PIDX_KDF_PARAM_PROPERTIES PIDX_ALG_PARAM_PROPERTIES +#define PIDX_KDF_PARAM_SALT 134 +#define PIDX_KDF_PARAM_SCRYPT_MAXMEM 135 +#define PIDX_KDF_PARAM_SCRYPT_N 136 +#define PIDX_KDF_PARAM_SCRYPT_P 137 +#define PIDX_KDF_PARAM_SCRYPT_R 124 +#define PIDX_KDF_PARAM_SECRET 138 +#define PIDX_KDF_PARAM_SEED 139 +#define PIDX_KDF_PARAM_SIZE 80 +#define PIDX_KDF_PARAM_SSHKDF_SESSION_ID 
140 +#define PIDX_KDF_PARAM_SSHKDF_TYPE 141 +#define PIDX_KDF_PARAM_SSHKDF_XCGHASH 142 +#define PIDX_KDF_PARAM_THREADS 143 +#define PIDX_KDF_PARAM_UKM 144 +#define PIDX_KDF_PARAM_X942_ACVPINFO 145 +#define PIDX_KDF_PARAM_X942_PARTYUINFO 146 +#define PIDX_KDF_PARAM_X942_PARTYVINFO 147 +#define PIDX_KDF_PARAM_X942_SUPP_PRIVINFO 148 +#define PIDX_KDF_PARAM_X942_SUPP_PUBINFO 149 +#define PIDX_KDF_PARAM_X942_USE_KEYBITS 150 +#define PIDX_KEM_PARAM_FIPS_APPROVED_INDICATOR PIDX_ALG_PARAM_FIPS_APPROVED_INDICATOR +#define PIDX_KEM_PARAM_FIPS_KEY_CHECK PIDX_PKEY_PARAM_FIPS_KEY_CHECK +#define PIDX_KEM_PARAM_IKME 151 +#define PIDX_KEM_PARAM_OPERATION 152 +#define PIDX_LIBSSL_RECORD_LAYER_PARAM_BLOCK_PADDING 153 +#define PIDX_LIBSSL_RECORD_LAYER_PARAM_HS_PADDING 154 +#define PIDX_LIBSSL_RECORD_LAYER_PARAM_MAX_EARLY_DATA 155 +#define PIDX_LIBSSL_RECORD_LAYER_PARAM_MAX_FRAG_LEN 156 +#define PIDX_LIBSSL_RECORD_LAYER_PARAM_MODE 55 +#define PIDX_LIBSSL_RECORD_LAYER_PARAM_OPTIONS 157 +#define PIDX_LIBSSL_RECORD_LAYER_PARAM_READ_AHEAD 158 +#define PIDX_LIBSSL_RECORD_LAYER_PARAM_STREAM_MAC 159 +#define PIDX_LIBSSL_RECORD_LAYER_PARAM_TLSTREE 160 +#define PIDX_LIBSSL_RECORD_LAYER_PARAM_USE_ETM 161 +#define PIDX_LIBSSL_RECORD_LAYER_READ_BUFFER_LEN 162 +#define PIDX_MAC_PARAM_BLOCK_SIZE 163 +#define PIDX_MAC_PARAM_CIPHER PIDX_ALG_PARAM_CIPHER +#define PIDX_MAC_PARAM_CUSTOM 164 +#define PIDX_MAC_PARAM_C_ROUNDS 165 +#define PIDX_MAC_PARAM_DIGEST PIDX_ALG_PARAM_DIGEST +#define PIDX_MAC_PARAM_DIGEST_NOINIT 166 +#define PIDX_MAC_PARAM_DIGEST_ONESHOT 167 +#define PIDX_MAC_PARAM_D_ROUNDS 168 +#define PIDX_MAC_PARAM_FIPS_APPROVED_INDICATOR PIDX_ALG_PARAM_FIPS_APPROVED_INDICATOR +#define PIDX_MAC_PARAM_FIPS_KEY_CHECK PIDX_PKEY_PARAM_FIPS_KEY_CHECK +#define PIDX_MAC_PARAM_FIPS_NO_SHORT_MAC PIDX_PROV_PARAM_NO_SHORT_MAC +#define PIDX_MAC_PARAM_IV 52 +#define PIDX_MAC_PARAM_KEY 127 +#define PIDX_MAC_PARAM_PROPERTIES PIDX_ALG_PARAM_PROPERTIES +#define PIDX_MAC_PARAM_SALT 134 +#define PIDX_MAC_PARAM_SIZE 80 +#define PIDX_MAC_PARAM_TLS_DATA_SIZE 169 +#define PIDX_MAC_PARAM_XOF 82 +#define PIDX_OBJECT_PARAM_DATA 117 +#define PIDX_OBJECT_PARAM_DATA_STRUCTURE 170 +#define PIDX_OBJECT_PARAM_DATA_TYPE 171 +#define PIDX_OBJECT_PARAM_DESC 172 +#define PIDX_OBJECT_PARAM_REFERENCE 173 +#define PIDX_OBJECT_PARAM_TYPE 141 +#define PIDX_PASSPHRASE_PARAM_INFO 122 +#define PIDX_PKEY_PARAM_ALGORITHM_ID PIDX_ALG_PARAM_ALGORITHM_ID +#define PIDX_PKEY_PARAM_ALGORITHM_ID_PARAMS PIDX_ALG_PARAM_ALGORITHM_ID_PARAMS +#define PIDX_PKEY_PARAM_BITS 174 +#define PIDX_PKEY_PARAM_CIPHER PIDX_ALG_PARAM_CIPHER +#define PIDX_PKEY_PARAM_DEFAULT_DIGEST 175 +#define PIDX_PKEY_PARAM_DHKEM_IKM 176 +#define PIDX_PKEY_PARAM_DH_GENERATOR 177 +#define PIDX_PKEY_PARAM_DH_PRIV_LEN 178 +#define PIDX_PKEY_PARAM_DIGEST PIDX_ALG_PARAM_DIGEST +#define PIDX_PKEY_PARAM_DIGEST_SIZE 179 +#define PIDX_PKEY_PARAM_DIST_ID 180 +#define PIDX_PKEY_PARAM_EC_A 181 +#define PIDX_PKEY_PARAM_EC_B 182 +#define PIDX_PKEY_PARAM_EC_CHAR2_M 183 +#define PIDX_PKEY_PARAM_EC_CHAR2_PP_K1 184 +#define PIDX_PKEY_PARAM_EC_CHAR2_PP_K2 185 +#define PIDX_PKEY_PARAM_EC_CHAR2_PP_K3 186 +#define PIDX_PKEY_PARAM_EC_CHAR2_TP_BASIS 187 +#define PIDX_PKEY_PARAM_EC_CHAR2_TYPE 188 +#define PIDX_PKEY_PARAM_EC_COFACTOR 189 +#define PIDX_PKEY_PARAM_EC_DECODED_FROM_EXPLICIT_PARAMS 190 +#define PIDX_PKEY_PARAM_EC_ENCODING 191 +#define PIDX_PKEY_PARAM_EC_FIELD_TYPE 192 +#define PIDX_PKEY_PARAM_EC_GENERATOR 193 +#define PIDX_PKEY_PARAM_EC_GROUP_CHECK_TYPE 194 +#define PIDX_PKEY_PARAM_EC_INCLUDE_PUBLIC 195 +#define 
PIDX_PKEY_PARAM_EC_ORDER 196 +#define PIDX_PKEY_PARAM_EC_P 137 +#define PIDX_PKEY_PARAM_EC_POINT_CONVERSION_FORMAT 197 +#define PIDX_PKEY_PARAM_EC_PUB_X 198 +#define PIDX_PKEY_PARAM_EC_PUB_Y 199 +#define PIDX_PKEY_PARAM_EC_SEED 139 +#define PIDX_PKEY_PARAM_ENCODED_PUBLIC_KEY 200 +#define PIDX_PKEY_PARAM_ENGINE PIDX_ALG_PARAM_ENGINE +#define PIDX_PKEY_PARAM_FFC_COFACTOR 201 +#define PIDX_PKEY_PARAM_FFC_DIGEST PIDX_PKEY_PARAM_DIGEST +#define PIDX_PKEY_PARAM_FFC_DIGEST_PROPS PIDX_PKEY_PARAM_PROPERTIES +#define PIDX_PKEY_PARAM_FFC_G 202 +#define PIDX_PKEY_PARAM_FFC_GINDEX 203 +#define PIDX_PKEY_PARAM_FFC_H 204 +#define PIDX_PKEY_PARAM_FFC_P 137 +#define PIDX_PKEY_PARAM_FFC_PBITS 205 +#define PIDX_PKEY_PARAM_FFC_PCOUNTER 206 +#define PIDX_PKEY_PARAM_FFC_Q 207 +#define PIDX_PKEY_PARAM_FFC_QBITS 208 +#define PIDX_PKEY_PARAM_FFC_SEED 139 +#define PIDX_PKEY_PARAM_FFC_TYPE 141 +#define PIDX_PKEY_PARAM_FFC_VALIDATE_G 209 +#define PIDX_PKEY_PARAM_FFC_VALIDATE_LEGACY 210 +#define PIDX_PKEY_PARAM_FFC_VALIDATE_PQ 211 +#define PIDX_PKEY_PARAM_FIPS_APPROVED_INDICATOR PIDX_ALG_PARAM_FIPS_APPROVED_INDICATOR +#define PIDX_PKEY_PARAM_FIPS_DIGEST_CHECK 212 +#define PIDX_PKEY_PARAM_FIPS_KEY_CHECK 213 +#define PIDX_PKEY_PARAM_FIPS_SIGN_CHECK 214 +#define PIDX_PKEY_PARAM_GROUP_NAME 215 +#define PIDX_PKEY_PARAM_IMPLICIT_REJECTION 8 +#define PIDX_PKEY_PARAM_MANDATORY_DIGEST 216 +#define PIDX_PKEY_PARAM_MASKGENFUNC 217 +#define PIDX_PKEY_PARAM_MAX_SIZE 218 +#define PIDX_PKEY_PARAM_MGF1_DIGEST 219 +#define PIDX_PKEY_PARAM_MGF1_PROPERTIES 220 +#define PIDX_PKEY_PARAM_PAD_MODE 221 +#define PIDX_PKEY_PARAM_PRIV_KEY 222 +#define PIDX_PKEY_PARAM_PROPERTIES PIDX_ALG_PARAM_PROPERTIES +#define PIDX_PKEY_PARAM_PUB_KEY 223 +#define PIDX_PKEY_PARAM_RSA_BITS PIDX_PKEY_PARAM_BITS +#define PIDX_PKEY_PARAM_RSA_COEFFICIENT 224 +#define PIDX_PKEY_PARAM_RSA_COEFFICIENT1 225 +#define PIDX_PKEY_PARAM_RSA_COEFFICIENT2 226 +#define PIDX_PKEY_PARAM_RSA_COEFFICIENT3 227 +#define PIDX_PKEY_PARAM_RSA_COEFFICIENT4 228 +#define PIDX_PKEY_PARAM_RSA_COEFFICIENT5 229 +#define PIDX_PKEY_PARAM_RSA_COEFFICIENT6 230 +#define PIDX_PKEY_PARAM_RSA_COEFFICIENT7 231 +#define PIDX_PKEY_PARAM_RSA_COEFFICIENT8 232 +#define PIDX_PKEY_PARAM_RSA_COEFFICIENT9 233 +#define PIDX_PKEY_PARAM_RSA_D 234 +#define PIDX_PKEY_PARAM_RSA_DERIVE_FROM_PQ 235 +#define PIDX_PKEY_PARAM_RSA_DIGEST PIDX_PKEY_PARAM_DIGEST +#define PIDX_PKEY_PARAM_RSA_DIGEST_PROPS PIDX_PKEY_PARAM_PROPERTIES +#define PIDX_PKEY_PARAM_RSA_E 236 +#define PIDX_PKEY_PARAM_RSA_EXPONENT 237 +#define PIDX_PKEY_PARAM_RSA_EXPONENT1 238 +#define PIDX_PKEY_PARAM_RSA_EXPONENT10 239 +#define PIDX_PKEY_PARAM_RSA_EXPONENT2 240 +#define PIDX_PKEY_PARAM_RSA_EXPONENT3 241 +#define PIDX_PKEY_PARAM_RSA_EXPONENT4 242 +#define PIDX_PKEY_PARAM_RSA_EXPONENT5 243 +#define PIDX_PKEY_PARAM_RSA_EXPONENT6 244 +#define PIDX_PKEY_PARAM_RSA_EXPONENT7 245 +#define PIDX_PKEY_PARAM_RSA_EXPONENT8 246 +#define PIDX_PKEY_PARAM_RSA_EXPONENT9 247 +#define PIDX_PKEY_PARAM_RSA_FACTOR 248 +#define PIDX_PKEY_PARAM_RSA_FACTOR1 249 +#define PIDX_PKEY_PARAM_RSA_FACTOR10 250 +#define PIDX_PKEY_PARAM_RSA_FACTOR2 251 +#define PIDX_PKEY_PARAM_RSA_FACTOR3 252 +#define PIDX_PKEY_PARAM_RSA_FACTOR4 253 +#define PIDX_PKEY_PARAM_RSA_FACTOR5 254 +#define PIDX_PKEY_PARAM_RSA_FACTOR6 255 +#define PIDX_PKEY_PARAM_RSA_FACTOR7 256 +#define PIDX_PKEY_PARAM_RSA_FACTOR8 257 +#define PIDX_PKEY_PARAM_RSA_FACTOR9 258 +#define PIDX_PKEY_PARAM_RSA_MASKGENFUNC PIDX_PKEY_PARAM_MASKGENFUNC +#define PIDX_PKEY_PARAM_RSA_MGF1_DIGEST PIDX_PKEY_PARAM_MGF1_DIGEST +#define 
PIDX_PKEY_PARAM_RSA_N 136 +#define PIDX_PKEY_PARAM_RSA_PRIMES 259 +#define PIDX_PKEY_PARAM_RSA_PSS_SALTLEN 260 +#define PIDX_PKEY_PARAM_RSA_TEST_P1 261 +#define PIDX_PKEY_PARAM_RSA_TEST_P2 262 +#define PIDX_PKEY_PARAM_RSA_TEST_Q1 263 +#define PIDX_PKEY_PARAM_RSA_TEST_Q2 264 +#define PIDX_PKEY_PARAM_RSA_TEST_XP 265 +#define PIDX_PKEY_PARAM_RSA_TEST_XP1 266 +#define PIDX_PKEY_PARAM_RSA_TEST_XP2 267 +#define PIDX_PKEY_PARAM_RSA_TEST_XQ 268 +#define PIDX_PKEY_PARAM_RSA_TEST_XQ1 269 +#define PIDX_PKEY_PARAM_RSA_TEST_XQ2 270 +#define PIDX_PKEY_PARAM_SECURITY_BITS 271 +#define PIDX_PKEY_PARAM_USE_COFACTOR_ECDH PIDX_PKEY_PARAM_USE_COFACTOR_FLAG +#define PIDX_PKEY_PARAM_USE_COFACTOR_FLAG 272 +#define PIDX_PROV_PARAM_BUILDINFO 273 +#define PIDX_PROV_PARAM_CORE_MODULE_FILENAME 274 +#define PIDX_PROV_PARAM_CORE_PROV_NAME 275 +#define PIDX_PROV_PARAM_CORE_VERSION 276 +#define PIDX_PROV_PARAM_DRBG_TRUNC_DIGEST 277 +#define PIDX_PROV_PARAM_DSA_SIGN_DISABLED 278 +#define PIDX_PROV_PARAM_ECDH_COFACTOR_CHECK 279 +#define PIDX_PROV_PARAM_HKDF_DIGEST_CHECK 280 +#define PIDX_PROV_PARAM_HKDF_KEY_CHECK 281 +#define PIDX_PROV_PARAM_HMAC_KEY_CHECK 282 +#define PIDX_PROV_PARAM_KBKDF_KEY_CHECK 283 +#define PIDX_PROV_PARAM_KMAC_KEY_CHECK 284 +#define PIDX_PROV_PARAM_NAME 285 +#define PIDX_PROV_PARAM_NO_SHORT_MAC 286 +#define PIDX_PROV_PARAM_PBKDF2_LOWER_BOUND_CHECK 287 +#define PIDX_PROV_PARAM_RSA_PKCS15_PAD_DISABLED 288 +#define PIDX_PROV_PARAM_RSA_PSS_SALTLEN_CHECK 289 +#define PIDX_PROV_PARAM_RSA_SIGN_X931_PAD_DISABLED 290 +#define PIDX_PROV_PARAM_SECURITY_CHECKS 291 +#define PIDX_PROV_PARAM_SELF_TEST_DESC 292 +#define PIDX_PROV_PARAM_SELF_TEST_PHASE 293 +#define PIDX_PROV_PARAM_SELF_TEST_TYPE 294 +#define PIDX_PROV_PARAM_SIGNATURE_DIGEST_CHECK 295 +#define PIDX_PROV_PARAM_SSHKDF_DIGEST_CHECK 296 +#define PIDX_PROV_PARAM_SSHKDF_KEY_CHECK 297 +#define PIDX_PROV_PARAM_SSKDF_DIGEST_CHECK 298 +#define PIDX_PROV_PARAM_SSKDF_KEY_CHECK 299 +#define PIDX_PROV_PARAM_STATUS 300 +#define PIDX_PROV_PARAM_TDES_ENCRYPT_DISABLED 301 +#define PIDX_PROV_PARAM_TLS13_KDF_DIGEST_CHECK 302 +#define PIDX_PROV_PARAM_TLS13_KDF_KEY_CHECK 303 +#define PIDX_PROV_PARAM_TLS1_PRF_DIGEST_CHECK 304 +#define PIDX_PROV_PARAM_TLS1_PRF_EMS_CHECK 305 +#define PIDX_PROV_PARAM_TLS1_PRF_KEY_CHECK 306 +#define PIDX_PROV_PARAM_VERSION 114 +#define PIDX_PROV_PARAM_X942KDF_KEY_CHECK 307 +#define PIDX_PROV_PARAM_X963KDF_DIGEST_CHECK 308 +#define PIDX_PROV_PARAM_X963KDF_KEY_CHECK 309 +#define PIDX_RAND_PARAM_FIPS_APPROVED_INDICATOR PIDX_ALG_PARAM_FIPS_APPROVED_INDICATOR +#define PIDX_RAND_PARAM_GENERATE 310 +#define PIDX_RAND_PARAM_MAX_REQUEST 311 +#define PIDX_RAND_PARAM_STATE 312 +#define PIDX_RAND_PARAM_STRENGTH 313 +#define PIDX_RAND_PARAM_TEST_ENTROPY 314 +#define PIDX_RAND_PARAM_TEST_NONCE 315 +#define PIDX_SIGNATURE_PARAM_ALGORITHM_ID PIDX_PKEY_PARAM_ALGORITHM_ID +#define PIDX_SIGNATURE_PARAM_ALGORITHM_ID_PARAMS PIDX_PKEY_PARAM_ALGORITHM_ID_PARAMS +#define PIDX_SIGNATURE_PARAM_CONTEXT_STRING 316 +#define PIDX_SIGNATURE_PARAM_DIGEST PIDX_PKEY_PARAM_DIGEST +#define PIDX_SIGNATURE_PARAM_DIGEST_SIZE PIDX_PKEY_PARAM_DIGEST_SIZE +#define PIDX_SIGNATURE_PARAM_FIPS_APPROVED_INDICATOR PIDX_ALG_PARAM_FIPS_APPROVED_INDICATOR +#define PIDX_SIGNATURE_PARAM_FIPS_DIGEST_CHECK PIDX_PKEY_PARAM_FIPS_DIGEST_CHECK +#define PIDX_SIGNATURE_PARAM_FIPS_KEY_CHECK PIDX_PKEY_PARAM_FIPS_KEY_CHECK +#define PIDX_SIGNATURE_PARAM_FIPS_RSA_PSS_SALTLEN_CHECK 289 +#define PIDX_SIGNATURE_PARAM_FIPS_SIGN_CHECK PIDX_PKEY_PARAM_FIPS_SIGN_CHECK +#define 
PIDX_SIGNATURE_PARAM_FIPS_SIGN_X931_PAD_CHECK 317 +#define PIDX_SIGNATURE_PARAM_FIPS_VERIFY_MESSAGE 318 +#define PIDX_SIGNATURE_PARAM_INSTANCE 319 +#define PIDX_SIGNATURE_PARAM_KAT 320 +#define PIDX_SIGNATURE_PARAM_MGF1_DIGEST PIDX_PKEY_PARAM_MGF1_DIGEST +#define PIDX_SIGNATURE_PARAM_MGF1_PROPERTIES PIDX_PKEY_PARAM_MGF1_PROPERTIES +#define PIDX_SIGNATURE_PARAM_NONCE_TYPE 321 +#define PIDX_SIGNATURE_PARAM_PAD_MODE PIDX_PKEY_PARAM_PAD_MODE +#define PIDX_SIGNATURE_PARAM_PROPERTIES PIDX_PKEY_PARAM_PROPERTIES +#define PIDX_SIGNATURE_PARAM_PSS_SALTLEN 260 +#define PIDX_SIGNATURE_PARAM_SIGNATURE 322 +#define PIDX_STORE_PARAM_ALIAS 323 +#define PIDX_STORE_PARAM_DIGEST 3 +#define PIDX_STORE_PARAM_EXPECT 324 +#define PIDX_STORE_PARAM_FINGERPRINT 325 +#define PIDX_STORE_PARAM_INPUT_TYPE 326 +#define PIDX_STORE_PARAM_ISSUER 285 +#define PIDX_STORE_PARAM_PROPERTIES 7 +#define PIDX_STORE_PARAM_SERIAL 327 +#define PIDX_STORE_PARAM_SUBJECT 328 diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/include/openssl/asn1.h b/CryptoPkg/Library/OpensslLib/OpensslGen/include/openssl/asn1.h index a90152ee0dcd..2425fa10cf34 100644 --- a/CryptoPkg/Library/OpensslLib/OpensslGen/include/openssl/asn1.h +++ b/CryptoPkg/Library/OpensslLib/OpensslGen/include/openssl/asn1.h @@ -2,7 +2,7 @@ * WARNING: do not edit! * Generated by Makefile from include/openssl/asn1.h.in * - * Copyright 1995-2021 The OpenSSL Project Authors. All Rights Reserved. + * Copyright 1995-2023 The OpenSSL Project Authors. All Rights Reserved. * * Licensed under the Apache License 2.0 (the "License"). You may not use * this file except in compliance with the License. You can obtain a copy @@ -21,6 +21,9 @@ # define HEADER_ASN1_H # endif +# ifndef OPENSSL_NO_STDIO +# include +# endif # include # include # include @@ -155,7 +158,7 @@ SKM_DEFINE_STACK_OF_INTERNAL(X509_ALGOR, X509_ALGOR, X509_ALGOR) -# define ASN1_STRING_FLAG_BITS_LEFT 0x08/* Set if 0x07 has bits left value */ +# define ASN1_STRING_FLAG_BITS_LEFT 0x08 /* Set if 0x07 has bits left value */ /* * This indicates that the ASN1_STRING is not a real value but just a place * holder for the location where indefinite length constructed data should be @@ -996,6 +999,8 @@ int ASN1_TYPE_get_int_octetstring(const ASN1_TYPE *a, long *num, unsigned char *data, int max_len); void *ASN1_item_unpack(const ASN1_STRING *oct, const ASN1_ITEM *it); +void *ASN1_item_unpack_ex(const ASN1_STRING *oct, const ASN1_ITEM *it, + OSSL_LIB_CTX *libctx, const char *propq); ASN1_STRING *ASN1_item_pack(void *obj, const ASN1_ITEM *it, ASN1_OCTET_STRING **oct); diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/include/openssl/bio.h b/CryptoPkg/Library/OpensslLib/OpensslGen/include/openssl/bio.h index 9021fd226825..89ed6c060ddc 100644 --- a/CryptoPkg/Library/OpensslLib/OpensslGen/include/openssl/bio.h +++ b/CryptoPkg/Library/OpensslLib/OpensslGen/include/openssl/bio.h @@ -2,7 +2,7 @@ * WARNING: do not edit! * Generated by Makefile from include/openssl/bio.h.in * - * Copyright 1995-2022 The OpenSSL Project Authors. All Rights Reserved. + * Copyright 1995-2024 The OpenSSL Project Authors. All Rights Reserved. * * Licensed under the Apache License 2.0 (the "License"). You may not use * this file except in compliance with the License. 
You can obtain a copy @@ -67,8 +67,13 @@ extern "C" { # define BIO_TYPE_DGRAM_SCTP (24|BIO_TYPE_SOURCE_SINK|BIO_TYPE_DESCRIPTOR) # endif # define BIO_TYPE_CORE_TO_PROV (25|BIO_TYPE_SOURCE_SINK) +# define BIO_TYPE_DGRAM_PAIR (26|BIO_TYPE_SOURCE_SINK) +# define BIO_TYPE_DGRAM_MEM (27|BIO_TYPE_SOURCE_SINK) +/* Custom type starting index returned by BIO_get_new_index() */ #define BIO_TYPE_START 128 +/* Custom type maximum index that can be returned by BIO_get_new_index() */ +#define BIO_TYPE_MASK 0xFF /* * BIO_FILENAME_READ|BIO_CLOSE to open or close on free. @@ -171,6 +176,30 @@ extern "C" { # define BIO_CTRL_SET_INDENT 80 # define BIO_CTRL_GET_INDENT 81 +# define BIO_CTRL_DGRAM_GET_LOCAL_ADDR_CAP 82 +# define BIO_CTRL_DGRAM_GET_LOCAL_ADDR_ENABLE 83 +# define BIO_CTRL_DGRAM_SET_LOCAL_ADDR_ENABLE 84 +# define BIO_CTRL_DGRAM_GET_EFFECTIVE_CAPS 85 +# define BIO_CTRL_DGRAM_GET_CAPS 86 +# define BIO_CTRL_DGRAM_SET_CAPS 87 +# define BIO_CTRL_DGRAM_GET_NO_TRUNC 88 +# define BIO_CTRL_DGRAM_SET_NO_TRUNC 89 + +/* + * internal BIO: + * # define BIO_CTRL_SET_KTLS_TX_ZEROCOPY_SENDFILE 90 + */ + +# define BIO_CTRL_GET_RPOLL_DESCRIPTOR 91 +# define BIO_CTRL_GET_WPOLL_DESCRIPTOR 92 +# define BIO_CTRL_DGRAM_DETECT_PEER_ADDR 93 + +# define BIO_DGRAM_CAP_NONE 0U +# define BIO_DGRAM_CAP_HANDLES_SRC_ADDR (1U << 0) +# define BIO_DGRAM_CAP_HANDLES_DST_ADDR (1U << 1) +# define BIO_DGRAM_CAP_PROVIDES_SRC_ADDR (1U << 2) +# define BIO_DGRAM_CAP_PROVIDES_DST_ADDR (1U << 3) + # ifndef OPENSSL_NO_KTLS # define BIO_get_ktls_send(b) \ (BIO_ctrl(b, BIO_CTRL_GET_KTLS_SEND, 0, NULL) > 0) @@ -208,7 +237,7 @@ extern "C" { # define BIO_FLAGS_NONCLEAR_RST 0x400 # define BIO_FLAGS_IN_EOF 0x800 -/* the BIO FLAGS values 0x1000 to 0x4000 are reserved for internal KTLS flags */ +/* the BIO FLAGS values 0x1000 to 0x8000 are reserved for internal KTLS flags */ typedef union bio_addr_st BIO_ADDR; typedef struct bio_addrinfo_st BIO_ADDRINFO; @@ -256,12 +285,14 @@ void BIO_clear_flags(BIO *b, int flags); # define BIO_RR_ACCEPT 0x03 /* These are passed by the BIO callback */ -# define BIO_CB_FREE 0x01 -# define BIO_CB_READ 0x02 -# define BIO_CB_WRITE 0x03 -# define BIO_CB_PUTS 0x04 -# define BIO_CB_GETS 0x05 -# define BIO_CB_CTRL 0x06 +# define BIO_CB_FREE 0x01 +# define BIO_CB_READ 0x02 +# define BIO_CB_WRITE 0x03 +# define BIO_CB_PUTS 0x04 +# define BIO_CB_GETS 0x05 +# define BIO_CB_CTRL 0x06 +# define BIO_CB_RECVMMSG 0x07 +# define BIO_CB_SENDMMSG 0x08 /* * The callback is called before and after the underling operation, The @@ -362,6 +393,36 @@ struct bio_dgram_sctp_prinfo { }; # endif +/* BIO_sendmmsg/BIO_recvmmsg-related definitions */ +typedef struct bio_msg_st { + void *data; + size_t data_len; + BIO_ADDR *peer, *local; + uint64_t flags; +} BIO_MSG; + +typedef struct bio_mmsg_cb_args_st { + BIO_MSG *msg; + size_t stride, num_msg; + uint64_t flags; + size_t *msgs_processed; +} BIO_MMSG_CB_ARGS; + +#define BIO_POLL_DESCRIPTOR_TYPE_NONE 0 +#define BIO_POLL_DESCRIPTOR_TYPE_SOCK_FD 1 +#define BIO_POLL_DESCRIPTOR_TYPE_SSL 2 +#define BIO_POLL_DESCRIPTOR_CUSTOM_START 8192 + +typedef struct bio_poll_descriptor_st { + uint32_t type; + union { + int fd; + void *custom; + uintptr_t custom_ui; + SSL *ssl; + } value; +} BIO_POLL_DESCRIPTOR; + /* * #define BIO_CONN_get_param_hostname BIO_ctrl */ @@ -428,10 +489,17 @@ struct bio_dgram_sctp_prinfo { # define BIO_C_SET_CONNECT_MODE 155 +# define BIO_C_SET_TFO 156 /* like BIO_C_SET_NBIO */ + +# define BIO_C_SET_SOCK_TYPE 157 +# define BIO_C_GET_SOCK_TYPE 158 +# define BIO_C_GET_DGRAM_BIO 159 + # define 
BIO_set_app_data(s,arg) BIO_set_ex_data(s,0,arg) # define BIO_get_app_data(s) BIO_get_ex_data(s,0) -# define BIO_set_nbio(b,n) BIO_ctrl(b,BIO_C_SET_NBIO,(n),NULL) +# define BIO_set_nbio(b,n) BIO_ctrl(b,BIO_C_SET_NBIO,(n),NULL) +# define BIO_set_tfo(b,n) BIO_ctrl(b,BIO_C_SET_TFO,(n),NULL) # ifndef OPENSSL_NO_SOCK /* IP families we support, for BIO_s_connect() and BIO_s_accept() */ @@ -452,7 +520,11 @@ struct bio_dgram_sctp_prinfo { # define BIO_get_conn_port(b) ((const char *)BIO_ptr_ctrl(b,BIO_C_GET_CONNECT,1)) # define BIO_get_conn_address(b) ((const BIO_ADDR *)BIO_ptr_ctrl(b,BIO_C_GET_CONNECT,2)) # define BIO_get_conn_ip_family(b) BIO_ctrl(b,BIO_C_GET_CONNECT,3,NULL) +# define BIO_get_conn_mode(b) BIO_ctrl(b,BIO_C_GET_CONNECT,4,NULL) # define BIO_set_conn_mode(b,n) BIO_ctrl(b,BIO_C_SET_CONNECT_MODE,(n),NULL) +# define BIO_set_sock_type(b,t) BIO_ctrl(b,BIO_C_SET_SOCK_TYPE,(t),NULL) +# define BIO_get_sock_type(b) BIO_ctrl(b,BIO_C_GET_SOCK_TYPE,0,NULL) +# define BIO_get0_dgram_bio(b, p) BIO_ctrl(b,BIO_C_GET_DGRAM_BIO,0,(void *)(BIO **)(p)) /* BIO_s_accept() */ # define BIO_set_accept_name(b,name) BIO_ctrl(b,BIO_C_SET_ACCEPT,0, \ @@ -469,6 +541,7 @@ struct bio_dgram_sctp_prinfo { (char *)(bio)) # define BIO_set_accept_ip_family(b,f) BIO_int_ctrl(b,BIO_C_SET_ACCEPT,4,f) # define BIO_get_accept_ip_family(b) BIO_ctrl(b,BIO_C_GET_ACCEPT,4,NULL) +# define BIO_set_tfo_accept(b,n) BIO_ctrl(b,BIO_C_SET_ACCEPT,5,(n)?(void *)"a":NULL) /* Aliases kept for backward compatibility */ # define BIO_BIND_NORMAL 0 @@ -596,8 +669,30 @@ int BIO_ctrl_reset_read_request(BIO *b); (int)BIO_ctrl(b, BIO_CTRL_DGRAM_GET_PEER, 0, (char *)(peer)) # define BIO_dgram_set_peer(b,peer) \ (int)BIO_ctrl(b, BIO_CTRL_DGRAM_SET_PEER, 0, (char *)(peer)) +# define BIO_dgram_detect_peer_addr(b,peer) \ + (int)BIO_ctrl(b, BIO_CTRL_DGRAM_DETECT_PEER_ADDR, 0, (char *)(peer)) # define BIO_dgram_get_mtu_overhead(b) \ (unsigned int)BIO_ctrl((b), BIO_CTRL_DGRAM_GET_MTU_OVERHEAD, 0, NULL) +# define BIO_dgram_get_local_addr_cap(b) \ + (int)BIO_ctrl((b), BIO_CTRL_DGRAM_GET_LOCAL_ADDR_CAP, 0, NULL) +# define BIO_dgram_get_local_addr_enable(b, penable) \ + (int)BIO_ctrl((b), BIO_CTRL_DGRAM_GET_LOCAL_ADDR_ENABLE, 0, (char *)(penable)) +# define BIO_dgram_set_local_addr_enable(b, enable) \ + (int)BIO_ctrl((b), BIO_CTRL_DGRAM_SET_LOCAL_ADDR_ENABLE, (enable), NULL) +# define BIO_dgram_get_effective_caps(b) \ + (uint32_t)BIO_ctrl((b), BIO_CTRL_DGRAM_GET_EFFECTIVE_CAPS, 0, NULL) +# define BIO_dgram_get_caps(b) \ + (uint32_t)BIO_ctrl((b), BIO_CTRL_DGRAM_GET_CAPS, 0, NULL) +# define BIO_dgram_set_caps(b, caps) \ + (int)BIO_ctrl((b), BIO_CTRL_DGRAM_SET_CAPS, (long)(caps), NULL) +# define BIO_dgram_get_no_trunc(b) \ + (unsigned int)BIO_ctrl((b), BIO_CTRL_DGRAM_GET_NO_TRUNC, 0, NULL) +# define BIO_dgram_set_no_trunc(b, enable) \ + (int)BIO_ctrl((b), BIO_CTRL_DGRAM_SET_NO_TRUNC, (enable), NULL) +# define BIO_dgram_get_mtu(b) \ + (unsigned int)BIO_ctrl((b), BIO_CTRL_DGRAM_GET_MTU, 0, NULL) +# define BIO_dgram_set_mtu(b, mtu) \ + (int)BIO_ctrl((b), BIO_CTRL_DGRAM_SET_MTU, (mtu), NULL) /* ctrl macros for BIO_f_prefix */ # define BIO_set_prefix(b,p) BIO_ctrl((b), BIO_CTRL_SET_PREFIX, 0, (void *)(p)) @@ -640,10 +735,18 @@ void BIO_vfree(BIO *a); int BIO_up_ref(BIO *a); int BIO_read(BIO *b, void *data, int dlen); int BIO_read_ex(BIO *b, void *data, size_t dlen, size_t *readbytes); +__owur int BIO_recvmmsg(BIO *b, BIO_MSG *msg, + size_t stride, size_t num_msg, uint64_t flags, + size_t *msgs_processed); int BIO_gets(BIO *bp, char *buf, int size); int 
BIO_get_line(BIO *bio, char *buf, int size); int BIO_write(BIO *b, const void *data, int dlen); int BIO_write_ex(BIO *b, const void *data, size_t dlen, size_t *written); +__owur int BIO_sendmmsg(BIO *b, BIO_MSG *msg, + size_t stride, size_t num_msg, uint64_t flags, + size_t *msgs_processed); +__owur int BIO_get_rpoll_descriptor(BIO *b, BIO_POLL_DESCRIPTOR *desc); +__owur int BIO_get_wpoll_descriptor(BIO *b, BIO_POLL_DESCRIPTOR *desc); int BIO_puts(BIO *bp, const char *buf); int BIO_indent(BIO *b, int indent, int max); long BIO_ctrl(BIO *bp, int cmd, long larg, void *parg); @@ -667,6 +770,9 @@ int BIO_nwrite0(BIO *bio, char **buf); int BIO_nwrite(BIO *bio, char **buf, int num); const BIO_METHOD *BIO_s_mem(void); +# ifndef OPENSSL_NO_DGRAM +const BIO_METHOD *BIO_s_dgram_mem(void); +# endif const BIO_METHOD *BIO_s_secmem(void); BIO *BIO_new_mem_buf(const void *buf, int len); # ifndef OPENSSL_NO_SOCK @@ -686,6 +792,7 @@ const BIO_METHOD *BIO_f_nbio_test(void); const BIO_METHOD *BIO_f_prefix(void); const BIO_METHOD *BIO_s_core(void); # ifndef OPENSSL_NO_DGRAM +const BIO_METHOD *BIO_s_dgram_pair(void); const BIO_METHOD *BIO_s_datagram(void); int BIO_dgram_non_fatal_error(int error); BIO *BIO_new_dgram(int fd, int close_flag); @@ -704,6 +811,7 @@ int BIO_dgram_sctp_msg_waiting(BIO *b); # ifndef OPENSSL_NO_SOCK int BIO_sock_should_retry(int i); int BIO_sock_non_fatal_error(int error); +int BIO_err_is_non_fatal(unsigned int errcode); int BIO_socket_wait(int fd, int for_read, time_t max_time); # endif int BIO_wait(BIO *bio, time_t max_time, unsigned int nap_milliseconds); @@ -726,6 +834,8 @@ int BIO_hex_string(BIO *out, int indent, int width, const void *data, # ifndef OPENSSL_NO_SOCK BIO_ADDR *BIO_ADDR_new(void); +int BIO_ADDR_copy(BIO_ADDR *dst, const BIO_ADDR *src); +BIO_ADDR *BIO_ADDR_dup(const BIO_ADDR *ap); int BIO_ADDR_rawmake(BIO_ADDR *ap, int family, const void *where, size_t wherelen, unsigned short port); void BIO_ADDR_free(BIO_ADDR *); @@ -788,6 +898,7 @@ int BIO_sock_info(int sock, # define BIO_SOCK_KEEPALIVE 0x04 # define BIO_SOCK_NONBLOCK 0x08 # define BIO_SOCK_NODELAY 0x10 +# define BIO_SOCK_TFO 0x20 int BIO_socket(int domain, int socktype, int protocol, int options); int BIO_connect(int sock, const BIO_ADDR *addr, int options); @@ -805,6 +916,11 @@ BIO *BIO_new_fd(int fd, int close_flag); int BIO_new_bio_pair(BIO **bio1, size_t writebuf1, BIO **bio2, size_t writebuf2); +# ifndef OPENSSL_NO_DGRAM +int BIO_new_bio_dgram_pair(BIO **bio1, size_t writebuf1, + BIO **bio2, size_t writebuf2); +# endif + /* * If successful, returns 1 and in *bio1, *bio2 two BIO pair endpoints. * Otherwise returns 0 and sets *bio1 and *bio2 to NULL. 
Size 0 uses default @@ -856,12 +972,24 @@ int BIO_meth_set_write(BIO_METHOD *biom, int (*write) (BIO *, const char *, int)); int BIO_meth_set_write_ex(BIO_METHOD *biom, int (*bwrite) (BIO *, const char *, size_t, size_t *)); +int BIO_meth_set_sendmmsg(BIO_METHOD *biom, + int (*f) (BIO *, BIO_MSG *, size_t, size_t, + uint64_t, size_t *)); +int (*BIO_meth_get_sendmmsg(const BIO_METHOD *biom))(BIO *, BIO_MSG *, + size_t, size_t, + uint64_t, size_t *); int (*BIO_meth_get_read(const BIO_METHOD *biom)) (BIO *, char *, int); int (*BIO_meth_get_read_ex(const BIO_METHOD *biom)) (BIO *, char *, size_t, size_t *); int BIO_meth_set_read(BIO_METHOD *biom, int (*read) (BIO *, char *, int)); int BIO_meth_set_read_ex(BIO_METHOD *biom, int (*bread) (BIO *, char *, size_t, size_t *)); +int BIO_meth_set_recvmmsg(BIO_METHOD *biom, + int (*f) (BIO *, BIO_MSG *, size_t, size_t, + uint64_t, size_t *)); +int (*BIO_meth_get_recvmmsg(const BIO_METHOD *biom))(BIO *, BIO_MSG *, + size_t, size_t, + uint64_t, size_t *); int (*BIO_meth_get_puts(const BIO_METHOD *biom)) (BIO *, const char *); int BIO_meth_set_puts(BIO_METHOD *biom, int (*puts) (BIO *, const char *)); diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/include/openssl/cmp.h b/CryptoPkg/Library/OpensslLib/OpensslGen/include/openssl/cmp.h index 11e842d9a321..284398b237bb 100644 --- a/CryptoPkg/Library/OpensslLib/OpensslGen/include/openssl/cmp.h +++ b/CryptoPkg/Library/OpensslLib/OpensslGen/include/openssl/cmp.h @@ -2,7 +2,7 @@ * WARNING: do not edit! * Generated by Makefile from include/openssl/cmp.h.in * - * Copyright 2007-2023 The OpenSSL Project Authors. All Rights Reserved. + * Copyright 2007-2024 The OpenSSL Project Authors. All Rights Reserved. * Copyright Nokia 2007-2019 * Copyright Siemens AG 2015-2019 * @@ -35,7 +35,9 @@ extern "C" { # endif -# define OSSL_CMP_PVNO 2 +# define OSSL_CMP_PVNO_2 2 +# define OSSL_CMP_PVNO_3 3 +# define OSSL_CMP_PVNO OSSL_CMP_PVNO_2 /* v2 is the default */ /*- * PKIFailureInfo ::= BIT STRING { @@ -137,7 +139,6 @@ extern "C" { # if OSSL_CMP_PKIFAILUREINFO_MAX_BIT_PATTERN > INT_MAX # error CMP_PKIFAILUREINFO_MAX bit pattern does not fit in type int # endif - typedef ASN1_BIT_STRING OSSL_CMP_PKIFAILUREINFO; # define OSSL_CMP_CTX_FAILINFO_badAlg (1 << 0) @@ -203,8 +204,8 @@ typedef ASN1_BIT_STRING OSSL_CMP_PKIFAILUREINFO; # define OSSL_CMP_PKISTATUS_revocationWarning 4 # define OSSL_CMP_PKISTATUS_revocationNotification 5 # define OSSL_CMP_PKISTATUS_keyUpdateWarning 6 - typedef ASN1_INTEGER OSSL_CMP_PKISTATUS; + DECLARE_ASN1_ITEM(OSSL_CMP_PKISTATUS) # define OSSL_CMP_CERTORENCCERT_CERTIFICATE 0 @@ -274,6 +275,46 @@ SKM_DEFINE_STACK_OF_INTERNAL(OSSL_CMP_ITAV, OSSL_CMP_ITAV, OSSL_CMP_ITAV) #define sk_OSSL_CMP_ITAV_deep_copy(sk, copyfunc, freefunc) ((STACK_OF(OSSL_CMP_ITAV) *)OPENSSL_sk_deep_copy(ossl_check_const_OSSL_CMP_ITAV_sk_type(sk), ossl_check_OSSL_CMP_ITAV_copyfunc_type(copyfunc), ossl_check_OSSL_CMP_ITAV_freefunc_type(freefunc))) #define sk_OSSL_CMP_ITAV_set_cmp_func(sk, cmp) ((sk_OSSL_CMP_ITAV_compfunc)OPENSSL_sk_set_cmp_func(ossl_check_OSSL_CMP_ITAV_sk_type(sk), ossl_check_OSSL_CMP_ITAV_compfunc_type(cmp))) + +typedef struct ossl_cmp_crlstatus_st OSSL_CMP_CRLSTATUS; +SKM_DEFINE_STACK_OF_INTERNAL(OSSL_CMP_CRLSTATUS, OSSL_CMP_CRLSTATUS, OSSL_CMP_CRLSTATUS) +#define sk_OSSL_CMP_CRLSTATUS_num(sk) OPENSSL_sk_num(ossl_check_const_OSSL_CMP_CRLSTATUS_sk_type(sk)) +#define sk_OSSL_CMP_CRLSTATUS_value(sk, idx) ((OSSL_CMP_CRLSTATUS *)OPENSSL_sk_value(ossl_check_const_OSSL_CMP_CRLSTATUS_sk_type(sk), (idx))) +#define 
sk_OSSL_CMP_CRLSTATUS_new(cmp) ((STACK_OF(OSSL_CMP_CRLSTATUS) *)OPENSSL_sk_new(ossl_check_OSSL_CMP_CRLSTATUS_compfunc_type(cmp))) +#define sk_OSSL_CMP_CRLSTATUS_new_null() ((STACK_OF(OSSL_CMP_CRLSTATUS) *)OPENSSL_sk_new_null()) +#define sk_OSSL_CMP_CRLSTATUS_new_reserve(cmp, n) ((STACK_OF(OSSL_CMP_CRLSTATUS) *)OPENSSL_sk_new_reserve(ossl_check_OSSL_CMP_CRLSTATUS_compfunc_type(cmp), (n))) +#define sk_OSSL_CMP_CRLSTATUS_reserve(sk, n) OPENSSL_sk_reserve(ossl_check_OSSL_CMP_CRLSTATUS_sk_type(sk), (n)) +#define sk_OSSL_CMP_CRLSTATUS_free(sk) OPENSSL_sk_free(ossl_check_OSSL_CMP_CRLSTATUS_sk_type(sk)) +#define sk_OSSL_CMP_CRLSTATUS_zero(sk) OPENSSL_sk_zero(ossl_check_OSSL_CMP_CRLSTATUS_sk_type(sk)) +#define sk_OSSL_CMP_CRLSTATUS_delete(sk, i) ((OSSL_CMP_CRLSTATUS *)OPENSSL_sk_delete(ossl_check_OSSL_CMP_CRLSTATUS_sk_type(sk), (i))) +#define sk_OSSL_CMP_CRLSTATUS_delete_ptr(sk, ptr) ((OSSL_CMP_CRLSTATUS *)OPENSSL_sk_delete_ptr(ossl_check_OSSL_CMP_CRLSTATUS_sk_type(sk), ossl_check_OSSL_CMP_CRLSTATUS_type(ptr))) +#define sk_OSSL_CMP_CRLSTATUS_push(sk, ptr) OPENSSL_sk_push(ossl_check_OSSL_CMP_CRLSTATUS_sk_type(sk), ossl_check_OSSL_CMP_CRLSTATUS_type(ptr)) +#define sk_OSSL_CMP_CRLSTATUS_unshift(sk, ptr) OPENSSL_sk_unshift(ossl_check_OSSL_CMP_CRLSTATUS_sk_type(sk), ossl_check_OSSL_CMP_CRLSTATUS_type(ptr)) +#define sk_OSSL_CMP_CRLSTATUS_pop(sk) ((OSSL_CMP_CRLSTATUS *)OPENSSL_sk_pop(ossl_check_OSSL_CMP_CRLSTATUS_sk_type(sk))) +#define sk_OSSL_CMP_CRLSTATUS_shift(sk) ((OSSL_CMP_CRLSTATUS *)OPENSSL_sk_shift(ossl_check_OSSL_CMP_CRLSTATUS_sk_type(sk))) +#define sk_OSSL_CMP_CRLSTATUS_pop_free(sk, freefunc) OPENSSL_sk_pop_free(ossl_check_OSSL_CMP_CRLSTATUS_sk_type(sk),ossl_check_OSSL_CMP_CRLSTATUS_freefunc_type(freefunc)) +#define sk_OSSL_CMP_CRLSTATUS_insert(sk, ptr, idx) OPENSSL_sk_insert(ossl_check_OSSL_CMP_CRLSTATUS_sk_type(sk), ossl_check_OSSL_CMP_CRLSTATUS_type(ptr), (idx)) +#define sk_OSSL_CMP_CRLSTATUS_set(sk, idx, ptr) ((OSSL_CMP_CRLSTATUS *)OPENSSL_sk_set(ossl_check_OSSL_CMP_CRLSTATUS_sk_type(sk), (idx), ossl_check_OSSL_CMP_CRLSTATUS_type(ptr))) +#define sk_OSSL_CMP_CRLSTATUS_find(sk, ptr) OPENSSL_sk_find(ossl_check_OSSL_CMP_CRLSTATUS_sk_type(sk), ossl_check_OSSL_CMP_CRLSTATUS_type(ptr)) +#define sk_OSSL_CMP_CRLSTATUS_find_ex(sk, ptr) OPENSSL_sk_find_ex(ossl_check_OSSL_CMP_CRLSTATUS_sk_type(sk), ossl_check_OSSL_CMP_CRLSTATUS_type(ptr)) +#define sk_OSSL_CMP_CRLSTATUS_find_all(sk, ptr, pnum) OPENSSL_sk_find_all(ossl_check_OSSL_CMP_CRLSTATUS_sk_type(sk), ossl_check_OSSL_CMP_CRLSTATUS_type(ptr), pnum) +#define sk_OSSL_CMP_CRLSTATUS_sort(sk) OPENSSL_sk_sort(ossl_check_OSSL_CMP_CRLSTATUS_sk_type(sk)) +#define sk_OSSL_CMP_CRLSTATUS_is_sorted(sk) OPENSSL_sk_is_sorted(ossl_check_const_OSSL_CMP_CRLSTATUS_sk_type(sk)) +#define sk_OSSL_CMP_CRLSTATUS_dup(sk) ((STACK_OF(OSSL_CMP_CRLSTATUS) *)OPENSSL_sk_dup(ossl_check_const_OSSL_CMP_CRLSTATUS_sk_type(sk))) +#define sk_OSSL_CMP_CRLSTATUS_deep_copy(sk, copyfunc, freefunc) ((STACK_OF(OSSL_CMP_CRLSTATUS) *)OPENSSL_sk_deep_copy(ossl_check_const_OSSL_CMP_CRLSTATUS_sk_type(sk), ossl_check_OSSL_CMP_CRLSTATUS_copyfunc_type(copyfunc), ossl_check_OSSL_CMP_CRLSTATUS_freefunc_type(freefunc))) +#define sk_OSSL_CMP_CRLSTATUS_set_cmp_func(sk, cmp) ((sk_OSSL_CMP_CRLSTATUS_compfunc)OPENSSL_sk_set_cmp_func(ossl_check_OSSL_CMP_CRLSTATUS_sk_type(sk), ossl_check_OSSL_CMP_CRLSTATUS_compfunc_type(cmp))) + + +typedef OSSL_CRMF_ATTRIBUTETYPEANDVALUE OSSL_CMP_ATAV; +# define OSSL_CMP_ATAV_free OSSL_CRMF_ATTRIBUTETYPEANDVALUE_free +typedef STACK_OF(OSSL_CRMF_ATTRIBUTETYPEANDVALUE) 
OSSL_CMP_ATAVS; +DECLARE_ASN1_FUNCTIONS(OSSL_CMP_ATAVS) +# define stack_st_OSSL_CMP_ATAV stack_st_OSSL_CRMF_ATTRIBUTETYPEANDVALUE +# define sk_OSSL_CMP_ATAV_num sk_OSSL_CRMF_ATTRIBUTETYPEANDVALUE_num +# define sk_OSSL_CMP_ATAV_value sk_OSSL_CRMF_ATTRIBUTETYPEANDVALUE_value +# define sk_OSSL_CMP_ATAV_push sk_OSSL_CRMF_ATTRIBUTETYPEANDVALUE_push +# define sk_OSSL_CMP_ATAV_pop_free sk_OSSL_CRMF_ATTRIBUTETYPEANDVALUE_pop_free + typedef struct ossl_cmp_revrepcontent_st OSSL_CMP_REVREPCONTENT; typedef struct ossl_cmp_pkisi_st OSSL_CMP_PKISI; DECLARE_ASN1_FUNCTIONS(OSSL_CMP_PKISI) @@ -375,21 +416,75 @@ void OSSL_CMP_ITAV_set0(OSSL_CMP_ITAV *itav, ASN1_OBJECT *type, ASN1_TYPE *value); ASN1_OBJECT *OSSL_CMP_ITAV_get0_type(const OSSL_CMP_ITAV *itav); ASN1_TYPE *OSSL_CMP_ITAV_get0_value(const OSSL_CMP_ITAV *itav); -int OSSL_CMP_ITAV_push0_stack_item(STACK_OF(OSSL_CMP_ITAV) **itav_sk_p, +int OSSL_CMP_ITAV_push0_stack_item(STACK_OF(OSSL_CMP_ITAV) **sk_p, OSSL_CMP_ITAV *itav); void OSSL_CMP_ITAV_free(OSSL_CMP_ITAV *itav); + +OSSL_CMP_ITAV *OSSL_CMP_ITAV_new0_certProfile(STACK_OF(ASN1_UTF8STRING) + *certProfile); +int OSSL_CMP_ITAV_get0_certProfile(const OSSL_CMP_ITAV *itav, + STACK_OF(ASN1_UTF8STRING) **out); +OSSL_CMP_ITAV *OSSL_CMP_ITAV_new_caCerts(const STACK_OF(X509) *caCerts); +int OSSL_CMP_ITAV_get0_caCerts(const OSSL_CMP_ITAV *itav, STACK_OF(X509) **out); + +OSSL_CMP_ITAV *OSSL_CMP_ITAV_new_rootCaCert(const X509 *rootCaCert); +int OSSL_CMP_ITAV_get0_rootCaCert(const OSSL_CMP_ITAV *itav, X509 **out); +OSSL_CMP_ITAV *OSSL_CMP_ITAV_new_rootCaKeyUpdate(const X509 *newWithNew, + const X509 *newWithOld, + const X509 *oldWithNew); +int OSSL_CMP_ITAV_get0_rootCaKeyUpdate(const OSSL_CMP_ITAV *itav, + X509 **newWithNew, + X509 **newWithOld, + X509 **oldWithNew); + +OSSL_CMP_CRLSTATUS *OSSL_CMP_CRLSTATUS_create(const X509_CRL *crl, + const X509 *cert, int only_DN); +OSSL_CMP_CRLSTATUS *OSSL_CMP_CRLSTATUS_new1(const DIST_POINT_NAME *dpn, + const GENERAL_NAMES *issuer, + const ASN1_TIME *thisUpdate); +int OSSL_CMP_CRLSTATUS_get0(const OSSL_CMP_CRLSTATUS *crlstatus, + DIST_POINT_NAME **dpn, GENERAL_NAMES **issuer, + ASN1_TIME **thisUpdate); +void OSSL_CMP_CRLSTATUS_free(OSSL_CMP_CRLSTATUS *crlstatus); +OSSL_CMP_ITAV +*OSSL_CMP_ITAV_new0_crlStatusList(STACK_OF(OSSL_CMP_CRLSTATUS) *crlStatusList); +int OSSL_CMP_ITAV_get0_crlStatusList(const OSSL_CMP_ITAV *itav, + STACK_OF(OSSL_CMP_CRLSTATUS) **out); +OSSL_CMP_ITAV *OSSL_CMP_ITAV_new_crls(const X509_CRL *crls); +int OSSL_CMP_ITAV_get0_crls(const OSSL_CMP_ITAV *it, STACK_OF(X509_CRL) **out); +OSSL_CMP_ITAV +*OSSL_CMP_ITAV_new0_certReqTemplate(OSSL_CRMF_CERTTEMPLATE *certTemplate, + OSSL_CMP_ATAVS *keySpec); +int OSSL_CMP_ITAV_get1_certReqTemplate(const OSSL_CMP_ITAV *itav, + OSSL_CRMF_CERTTEMPLATE **certTemplate, + OSSL_CMP_ATAVS **keySpec); + +OSSL_CMP_ATAV *OSSL_CMP_ATAV_create(ASN1_OBJECT *type, ASN1_TYPE *value); +void OSSL_CMP_ATAV_set0(OSSL_CMP_ATAV *itav, ASN1_OBJECT *type, + ASN1_TYPE *value); +ASN1_OBJECT *OSSL_CMP_ATAV_get0_type(const OSSL_CMP_ATAV *itav); +ASN1_TYPE *OSSL_CMP_ATAV_get0_value(const OSSL_CMP_ATAV *itav); +OSSL_CMP_ATAV *OSSL_CMP_ATAV_new_algId(const X509_ALGOR *alg); +X509_ALGOR *OSSL_CMP_ATAV_get0_algId(const OSSL_CMP_ATAV *atav); +OSSL_CMP_ATAV *OSSL_CMP_ATAV_new_rsaKeyLen(int len); +int OSSL_CMP_ATAV_get_rsaKeyLen(const OSSL_CMP_ATAV *atav); +int OSSL_CMP_ATAV_push1(OSSL_CMP_ATAVS **sk_p, const OSSL_CMP_ATAV *atav); + void OSSL_CMP_MSG_free(OSSL_CMP_MSG *msg); /* from cmp_ctx.c */ OSSL_CMP_CTX *OSSL_CMP_CTX_new(OSSL_LIB_CTX *libctx, 
const char *propq); void OSSL_CMP_CTX_free(OSSL_CMP_CTX *ctx); int OSSL_CMP_CTX_reinit(OSSL_CMP_CTX *ctx); +OSSL_LIB_CTX *OSSL_CMP_CTX_get0_libctx(const OSSL_CMP_CTX *ctx); +const char *OSSL_CMP_CTX_get0_propq(const OSSL_CMP_CTX *ctx); /* CMP general options: */ # define OSSL_CMP_OPT_LOG_VERBOSITY 0 /* CMP transfer options: */ -# define OSSL_CMP_OPT_KEEP_ALIVE 10 -# define OSSL_CMP_OPT_MSG_TIMEOUT 11 +# define OSSL_CMP_OPT_KEEP_ALIVE 10 +# define OSSL_CMP_OPT_MSG_TIMEOUT 11 # define OSSL_CMP_OPT_TOTAL_TIMEOUT 12 +# define OSSL_CMP_OPT_USE_TLS 13 /* CMP request options: */ # define OSSL_CMP_OPT_VALIDITY_DAYS 20 # define OSSL_CMP_OPT_SUBJECTALTNAME_NODEFAULT 21 @@ -407,6 +502,7 @@ int OSSL_CMP_CTX_reinit(OSSL_CMP_CTX *ctx); # define OSSL_CMP_OPT_DIGEST_ALGNID 34 # define OSSL_CMP_OPT_IGNORE_KEYUSAGE 35 # define OSSL_CMP_OPT_PERMIT_TA_IN_EXTRACERTS_FOR_IR 36 +# define OSSL_CMP_OPT_NO_CACHE_EXTRACERTS 37 int OSSL_CMP_CTX_set_option(OSSL_CMP_CTX *ctx, int opt, int val); int OSSL_CMP_CTX_get_option(const OSSL_CMP_CTX *ctx, int opt); /* CMP-specific callback for logging and outputting the error queue: */ @@ -420,9 +516,11 @@ int OSSL_CMP_CTX_set1_server(OSSL_CMP_CTX *ctx, const char *address); int OSSL_CMP_CTX_set_serverPort(OSSL_CMP_CTX *ctx, int port); int OSSL_CMP_CTX_set1_proxy(OSSL_CMP_CTX *ctx, const char *name); int OSSL_CMP_CTX_set1_no_proxy(OSSL_CMP_CTX *ctx, const char *names); +# ifndef OPENSSL_NO_HTTP int OSSL_CMP_CTX_set_http_cb(OSSL_CMP_CTX *ctx, OSSL_HTTP_bio_cb_t cb); int OSSL_CMP_CTX_set_http_cb_arg(OSSL_CMP_CTX *ctx, void *arg); void *OSSL_CMP_CTX_get_http_cb_arg(const OSSL_CMP_CTX *ctx); +# endif typedef OSSL_CMP_MSG *(*OSSL_CMP_transfer_cb_t) (OSSL_CMP_CTX *ctx, const OSSL_CMP_MSG *req); int OSSL_CMP_CTX_set_transfer_cb(OSSL_CMP_CTX *ctx, OSSL_CMP_transfer_cb_t cb); @@ -432,7 +530,9 @@ void *OSSL_CMP_CTX_get_transfer_cb_arg(const OSSL_CMP_CTX *ctx); int OSSL_CMP_CTX_set1_srvCert(OSSL_CMP_CTX *ctx, X509 *cert); int OSSL_CMP_CTX_set1_expected_sender(OSSL_CMP_CTX *ctx, const X509_NAME *name); int OSSL_CMP_CTX_set0_trustedStore(OSSL_CMP_CTX *ctx, X509_STORE *store); +# define OSSL_CMP_CTX_set0_trusted OSSL_CMP_CTX_set0_trustedStore X509_STORE *OSSL_CMP_CTX_get0_trustedStore(const OSSL_CMP_CTX *ctx); +# define OSSL_CMP_CTX_get0_trusted OSSL_CMP_CTX_get0_trustedStore int OSSL_CMP_CTX_set1_untrusted(OSSL_CMP_CTX *ctx, STACK_OF(X509) *certs); STACK_OF(X509) *OSSL_CMP_CTX_get0_untrusted(const OSSL_CMP_CTX *ctx); /* client authentication: */ @@ -448,12 +548,15 @@ int OSSL_CMP_CTX_set1_secretValue(OSSL_CMP_CTX *ctx, int OSSL_CMP_CTX_set1_recipient(OSSL_CMP_CTX *ctx, const X509_NAME *name); int OSSL_CMP_CTX_push0_geninfo_ITAV(OSSL_CMP_CTX *ctx, OSSL_CMP_ITAV *itav); int OSSL_CMP_CTX_reset_geninfo_ITAVs(OSSL_CMP_CTX *ctx); +STACK_OF(OSSL_CMP_ITAV) + *OSSL_CMP_CTX_get0_geninfo_ITAVs(const OSSL_CMP_CTX *ctx); int OSSL_CMP_CTX_set1_extraCertsOut(OSSL_CMP_CTX *ctx, STACK_OF(X509) *extraCertsOut); /* certificate template: */ int OSSL_CMP_CTX_set0_newPkey(OSSL_CMP_CTX *ctx, int priv, EVP_PKEY *pkey); EVP_PKEY *OSSL_CMP_CTX_get0_newPkey(const OSSL_CMP_CTX *ctx, int priv); int OSSL_CMP_CTX_set1_issuer(OSSL_CMP_CTX *ctx, const X509_NAME *name); +int OSSL_CMP_CTX_set1_serialNumber(OSSL_CMP_CTX *ctx, const ASN1_INTEGER *sn); int OSSL_CMP_CTX_set1_subjectName(OSSL_CMP_CTX *ctx, const X509_NAME *name); int OSSL_CMP_CTX_push1_subjectAltName(OSSL_CMP_CTX *ctx, const GENERAL_NAME *name); @@ -477,6 +580,7 @@ int OSSL_CMP_CTX_get_status(const OSSL_CMP_CTX *ctx); OSSL_CMP_PKIFREETEXT 
*OSSL_CMP_CTX_get0_statusString(const OSSL_CMP_CTX *ctx); int OSSL_CMP_CTX_get_failInfoCode(const OSSL_CMP_CTX *ctx); # define OSSL_CMP_PKISI_BUFLEN 1024 +X509 *OSSL_CMP_CTX_get0_validatedSrvCert(const OSSL_CMP_CTX *ctx); X509 *OSSL_CMP_CTX_get0_newCert(const OSSL_CMP_CTX *ctx); STACK_OF(X509) *OSSL_CMP_CTX_get1_newChain(const OSSL_CMP_CTX *ctx); STACK_OF(X509) *OSSL_CMP_CTX_get1_caPubs(const OSSL_CMP_CTX *ctx); @@ -498,10 +602,13 @@ OSSL_CMP_STATUSINFO_new(int status, int fail_info, const char *text); ASN1_OCTET_STRING *OSSL_CMP_HDR_get0_transactionID(const OSSL_CMP_PKIHEADER *hdr); ASN1_OCTET_STRING *OSSL_CMP_HDR_get0_recipNonce(const OSSL_CMP_PKIHEADER *hdr); +STACK_OF(OSSL_CMP_ITAV) + *OSSL_CMP_HDR_get0_geninfo_ITAVs(const OSSL_CMP_PKIHEADER *hdr); /* from cmp_msg.c */ OSSL_CMP_PKIHEADER *OSSL_CMP_MSG_get0_header(const OSSL_CMP_MSG *msg); int OSSL_CMP_MSG_get_bodytype(const OSSL_CMP_MSG *msg); +X509_PUBKEY *OSSL_CMP_MSG_get0_certreq_publickey(const OSSL_CMP_MSG *msg); int OSSL_CMP_MSG_update_transactionID(OSSL_CMP_CTX *ctx, OSSL_CMP_MSG *msg); int OSSL_CMP_MSG_update_recipNonce(OSSL_CMP_CTX *ctx, OSSL_CMP_MSG *msg); OSSL_CRMF_MSG *OSSL_CMP_CTX_setup_CRM(OSSL_CMP_CTX *ctx, int for_KUR, int rid); @@ -517,8 +624,10 @@ int OSSL_CMP_validate_cert_path(const OSSL_CMP_CTX *ctx, X509_STORE *trusted_store, X509 *cert); /* from cmp_http.c */ +# ifndef OPENSSL_NO_HTTP OSSL_CMP_MSG *OSSL_CMP_MSG_http_perform(OSSL_CMP_CTX *ctx, const OSSL_CMP_MSG *req); +# endif /* from cmp_server.c */ typedef struct ossl_cmp_srv_ctx_st OSSL_CMP_SRV_CTX; @@ -561,6 +670,13 @@ int OSSL_CMP_SRV_CTX_init(OSSL_CMP_SRV_CTX *srv_ctx, void *custom_ctx, OSSL_CMP_SRV_error_cb_t process_error, OSSL_CMP_SRV_certConf_cb_t process_certConf, OSSL_CMP_SRV_pollReq_cb_t process_pollReq); +typedef int (*OSSL_CMP_SRV_delayed_delivery_cb_t)(OSSL_CMP_SRV_CTX *srv_ctx, + const OSSL_CMP_MSG *req); +typedef int (*OSSL_CMP_SRV_clean_transaction_cb_t)(OSSL_CMP_SRV_CTX *srv_ctx, + const ASN1_OCTET_STRING *id); +int OSSL_CMP_SRV_CTX_init_trans(OSSL_CMP_SRV_CTX *srv_ctx, + OSSL_CMP_SRV_delayed_delivery_cb_t delay, + OSSL_CMP_SRV_clean_transaction_cb_t clean); OSSL_CMP_CTX *OSSL_CMP_SRV_CTX_get0_cmp_ctx(const OSSL_CMP_SRV_CTX *srv_ctx); void *OSSL_CMP_SRV_CTX_get0_custom_ctx(const OSSL_CMP_SRV_CTX *srv_ctx); int OSSL_CMP_SRV_CTX_set_send_unprotected_errors(OSSL_CMP_SRV_CTX *srv_ctx, @@ -577,6 +693,8 @@ X509 *OSSL_CMP_exec_certreq(OSSL_CMP_CTX *ctx, int req_type, # define OSSL_CMP_CR 2 # define OSSL_CMP_P10CR 4 # define OSSL_CMP_KUR 7 +# define OSSL_CMP_GENM 21 +# define OSSL_CMP_ERROR 23 # define OSSL_CMP_exec_IR_ses(ctx) \ OSSL_CMP_exec_certreq(ctx, OSSL_CMP_IR, NULL) # define OSSL_CMP_exec_CR_ses(ctx) \ @@ -590,6 +708,18 @@ int OSSL_CMP_try_certreq(OSSL_CMP_CTX *ctx, int req_type, int OSSL_CMP_exec_RR_ses(OSSL_CMP_CTX *ctx); STACK_OF(OSSL_CMP_ITAV) *OSSL_CMP_exec_GENM_ses(OSSL_CMP_CTX *ctx); +/* from cmp_genm.c */ +int OSSL_CMP_get1_caCerts(OSSL_CMP_CTX *ctx, STACK_OF(X509) **out); +int OSSL_CMP_get1_rootCaKeyUpdate(OSSL_CMP_CTX *ctx, + const X509 *oldWithOld, X509 **newWithNew, + X509 **newWithOld, X509 **oldWithNew); +int OSSL_CMP_get1_crlUpdate(OSSL_CMP_CTX *ctx, const X509 *crlcert, + const X509_CRL *last_crl, + X509_CRL **crl); +int OSSL_CMP_get1_certReqTemplate(OSSL_CMP_CTX *ctx, + OSSL_CRMF_CERTTEMPLATE **certTemplate, + OSSL_CMP_ATAVS **keySpec); + # ifdef __cplusplus } # endif diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/include/openssl/cms.h b/CryptoPkg/Library/OpensslLib/OpensslGen/include/openssl/cms.h index 
38567efe86ce..0f21a5193020 100644 --- a/CryptoPkg/Library/OpensslLib/OpensslGen/include/openssl/cms.h +++ b/CryptoPkg/Library/OpensslLib/OpensslGen/include/openssl/cms.h @@ -2,7 +2,7 @@ * WARNING: do not edit! * Generated by Makefile from include/openssl/cms.h.in * - * Copyright 2008-2021 The OpenSSL Project Authors. All Rights Reserved. + * Copyright 2008-2022 The OpenSSL Project Authors. All Rights Reserved. * * Licensed under the Apache License 2.0 (the "License"). You may not use * this file except in compliance with the License. You can obtain a copy @@ -31,8 +31,10 @@ extern "C" { # endif +typedef struct CMS_EnvelopedData_st CMS_EnvelopedData; typedef struct CMS_ContentInfo_st CMS_ContentInfo; typedef struct CMS_SignerInfo_st CMS_SignerInfo; +typedef struct CMS_SignedData_st CMS_SignedData; typedef struct CMS_CertificateChoices CMS_CertificateChoices; typedef struct CMS_RevocationInfoChoice_st CMS_RevocationInfoChoice; typedef struct CMS_RecipientInfo_st CMS_RecipientInfo; @@ -147,6 +149,8 @@ SKM_DEFINE_STACK_OF_INTERNAL(CMS_RevocationInfoChoice, CMS_RevocationInfoChoice, #define sk_CMS_RevocationInfoChoice_set_cmp_func(sk, cmp) ((sk_CMS_RevocationInfoChoice_compfunc)OPENSSL_sk_set_cmp_func(ossl_check_CMS_RevocationInfoChoice_sk_type(sk), ossl_check_CMS_RevocationInfoChoice_compfunc_type(cmp))) +DECLARE_ASN1_ITEM(CMS_EnvelopedData) +DECLARE_ASN1_ALLOC_FUNCTIONS(CMS_SignedData) DECLARE_ASN1_FUNCTIONS(CMS_ContentInfo) DECLARE_ASN1_FUNCTIONS(CMS_ReceiptRequest) DECLARE_ASN1_PRINT_FUNCTION(CMS_ContentInfo) @@ -217,13 +221,16 @@ int SMIME_write_CMS(BIO *bio, CMS_ContentInfo *cms, BIO *data, int flags); int CMS_final(CMS_ContentInfo *cms, BIO *data, BIO *dcont, unsigned int flags); +int CMS_final_digest(CMS_ContentInfo *cms, + const unsigned char *md, unsigned int mdlen, BIO *dcont, + unsigned int flags); CMS_ContentInfo *CMS_sign(X509 *signcert, EVP_PKEY *pkey, STACK_OF(X509) *certs, BIO *data, unsigned int flags); CMS_ContentInfo *CMS_sign_ex(X509 *signcert, EVP_PKEY *pkey, STACK_OF(X509) *certs, BIO *data, - unsigned int flags, OSSL_LIB_CTX *ctx, + unsigned int flags, OSSL_LIB_CTX *libctx, const char *propq); CMS_ContentInfo *CMS_sign_receipt(CMS_SignerInfo *si, @@ -233,27 +240,26 @@ CMS_ContentInfo *CMS_sign_receipt(CMS_SignerInfo *si, int CMS_data(CMS_ContentInfo *cms, BIO *out, unsigned int flags); CMS_ContentInfo *CMS_data_create(BIO *in, unsigned int flags); CMS_ContentInfo *CMS_data_create_ex(BIO *in, unsigned int flags, - OSSL_LIB_CTX *ctx, const char *propq); + OSSL_LIB_CTX *libctx, const char *propq); int CMS_digest_verify(CMS_ContentInfo *cms, BIO *dcont, BIO *out, unsigned int flags); CMS_ContentInfo *CMS_digest_create(BIO *in, const EVP_MD *md, unsigned int flags); CMS_ContentInfo *CMS_digest_create_ex(BIO *in, const EVP_MD *md, - unsigned int flags, OSSL_LIB_CTX *ctx, + unsigned int flags, OSSL_LIB_CTX *libctx, const char *propq); int CMS_EncryptedData_decrypt(CMS_ContentInfo *cms, const unsigned char *key, size_t keylen, BIO *dcont, BIO *out, unsigned int flags); - CMS_ContentInfo *CMS_EncryptedData_encrypt(BIO *in, const EVP_CIPHER *cipher, const unsigned char *key, size_t keylen, unsigned int flags); CMS_ContentInfo *CMS_EncryptedData_encrypt_ex(BIO *in, const EVP_CIPHER *cipher, const unsigned char *key, size_t keylen, unsigned int flags, - OSSL_LIB_CTX *ctx, + OSSL_LIB_CTX *libctx, const char *propq); int CMS_EncryptedData_set1_key(CMS_ContentInfo *cms, const EVP_CIPHER *ciph, @@ -272,7 +278,7 @@ CMS_ContentInfo *CMS_encrypt(STACK_OF(X509) *certs, BIO *in, const 
EVP_CIPHER *cipher, unsigned int flags); CMS_ContentInfo *CMS_encrypt_ex(STACK_OF(X509) *certs, BIO *in, const EVP_CIPHER *cipher, unsigned int flags, - OSSL_LIB_CTX *ctx, const char *propq); + OSSL_LIB_CTX *libctx, const char *propq); int CMS_decrypt(CMS_ContentInfo *cms, EVP_PKEY *pkey, X509 *cert, BIO *dcont, BIO *out, unsigned int flags); @@ -291,12 +297,16 @@ int CMS_RecipientInfo_type(CMS_RecipientInfo *ri); EVP_PKEY_CTX *CMS_RecipientInfo_get0_pkey_ctx(CMS_RecipientInfo *ri); CMS_ContentInfo *CMS_AuthEnvelopedData_create(const EVP_CIPHER *cipher); CMS_ContentInfo * -CMS_AuthEnvelopedData_create_ex(const EVP_CIPHER *cipher, OSSL_LIB_CTX *ctx, +CMS_AuthEnvelopedData_create_ex(const EVP_CIPHER *cipher, OSSL_LIB_CTX *libctx, const char *propq); CMS_ContentInfo *CMS_EnvelopedData_create(const EVP_CIPHER *cipher); CMS_ContentInfo *CMS_EnvelopedData_create_ex(const EVP_CIPHER *cipher, - OSSL_LIB_CTX *ctx, + OSSL_LIB_CTX *libctx, const char *propq); +BIO *CMS_EnvelopedData_decrypt(CMS_EnvelopedData *env, BIO *detached_data, + EVP_PKEY *pkey, X509 *cert, + ASN1_OCTET_STRING *secret, unsigned int flags, + OSSL_LIB_CTX *libctx, const char *propq); CMS_RecipientInfo *CMS_add1_recipient_cert(CMS_ContentInfo *cms, X509 *recip, unsigned int flags); @@ -385,6 +395,11 @@ ASN1_OCTET_STRING *CMS_SignerInfo_get0_signature(CMS_SignerInfo *si); int CMS_SignerInfo_sign(CMS_SignerInfo *si); int CMS_SignerInfo_verify(CMS_SignerInfo *si); int CMS_SignerInfo_verify_content(CMS_SignerInfo *si, BIO *chain); +BIO *CMS_SignedData_verify(CMS_SignedData *sd, BIO *detached_data, + STACK_OF(X509) *scerts, X509_STORE *store, + STACK_OF(X509) *extra, STACK_OF(X509_CRL) *crls, + unsigned int flags, + OSSL_LIB_CTX *libctx, const char *propq); int CMS_add_smimecap(CMS_SignerInfo *si, STACK_OF(X509_ALGOR) *algs); int CMS_add_simple_smimecap(STACK_OF(X509_ALGOR) **algs, @@ -441,7 +456,7 @@ CMS_ReceiptRequest *CMS_ReceiptRequest_create0_ex( unsigned char *id, int idlen, int allorfirst, STACK_OF(GENERAL_NAMES) *receiptList, STACK_OF(GENERAL_NAMES) *receiptsTo, - OSSL_LIB_CTX *ctx); + OSSL_LIB_CTX *libctx); int CMS_add1_ReceiptRequest(CMS_SignerInfo *si, CMS_ReceiptRequest *rr); void CMS_ReceiptRequest_get0_values(CMS_ReceiptRequest *rr, diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/include/openssl/comp.h b/CryptoPkg/Library/OpensslLib/OpensslGen/include/openssl/comp.h new file mode 100644 index 000000000000..1aa062f19250 --- /dev/null +++ b/CryptoPkg/Library/OpensslLib/OpensslGen/include/openssl/comp.h @@ -0,0 +1,98 @@ +/* + * Copyright 2015-2024 The OpenSSL Project Authors. All Rights Reserved. + * + * Licensed under the Apache License 2.0 (the "License"). You may not use + * this file except in compliance with the License. 
You can obtain a copy + * in the file LICENSE in the source distribution or at + * https://www.openssl.org/source/license.html + */ + + + +#ifndef OPENSSL_COMP_H +# define OPENSSL_COMP_H +# pragma once + +# include +# ifndef OPENSSL_NO_DEPRECATED_3_0 +# define HEADER_COMP_H +# endif + +# include + +# include +# include +# ifdef __cplusplus +extern "C" { +# endif + + + +# ifndef OPENSSL_NO_COMP + +COMP_CTX *COMP_CTX_new(COMP_METHOD *meth); +const COMP_METHOD *COMP_CTX_get_method(const COMP_CTX *ctx); +int COMP_CTX_get_type(const COMP_CTX* comp); +int COMP_get_type(const COMP_METHOD *meth); +const char *COMP_get_name(const COMP_METHOD *meth); +void COMP_CTX_free(COMP_CTX *ctx); + +int COMP_compress_block(COMP_CTX *ctx, unsigned char *out, int olen, + unsigned char *in, int ilen); +int COMP_expand_block(COMP_CTX *ctx, unsigned char *out, int olen, + unsigned char *in, int ilen); + +COMP_METHOD *COMP_zlib(void); +COMP_METHOD *COMP_zlib_oneshot(void); +COMP_METHOD *COMP_brotli(void); +COMP_METHOD *COMP_brotli_oneshot(void); +COMP_METHOD *COMP_zstd(void); +COMP_METHOD *COMP_zstd_oneshot(void); + +# ifndef OPENSSL_NO_DEPRECATED_1_1_0 +# define COMP_zlib_cleanup() while(0) continue +# endif + +# ifdef OPENSSL_BIO_H +const BIO_METHOD *BIO_f_zlib(void); +const BIO_METHOD *BIO_f_brotli(void); +const BIO_METHOD *BIO_f_zstd(void); +# endif + +# endif + +typedef struct ssl_comp_st SSL_COMP; + +SKM_DEFINE_STACK_OF_INTERNAL(SSL_COMP, SSL_COMP, SSL_COMP) +#define sk_SSL_COMP_num(sk) OPENSSL_sk_num(ossl_check_const_SSL_COMP_sk_type(sk)) +#define sk_SSL_COMP_value(sk, idx) ((SSL_COMP *)OPENSSL_sk_value(ossl_check_const_SSL_COMP_sk_type(sk), (idx))) +#define sk_SSL_COMP_new(cmp) ((STACK_OF(SSL_COMP) *)OPENSSL_sk_new(ossl_check_SSL_COMP_compfunc_type(cmp))) +#define sk_SSL_COMP_new_null() ((STACK_OF(SSL_COMP) *)OPENSSL_sk_new_null()) +#define sk_SSL_COMP_new_reserve(cmp, n) ((STACK_OF(SSL_COMP) *)OPENSSL_sk_new_reserve(ossl_check_SSL_COMP_compfunc_type(cmp), (n))) +#define sk_SSL_COMP_reserve(sk, n) OPENSSL_sk_reserve(ossl_check_SSL_COMP_sk_type(sk), (n)) +#define sk_SSL_COMP_free(sk) OPENSSL_sk_free(ossl_check_SSL_COMP_sk_type(sk)) +#define sk_SSL_COMP_zero(sk) OPENSSL_sk_zero(ossl_check_SSL_COMP_sk_type(sk)) +#define sk_SSL_COMP_delete(sk, i) ((SSL_COMP *)OPENSSL_sk_delete(ossl_check_SSL_COMP_sk_type(sk), (i))) +#define sk_SSL_COMP_delete_ptr(sk, ptr) ((SSL_COMP *)OPENSSL_sk_delete_ptr(ossl_check_SSL_COMP_sk_type(sk), ossl_check_SSL_COMP_type(ptr))) +#define sk_SSL_COMP_push(sk, ptr) OPENSSL_sk_push(ossl_check_SSL_COMP_sk_type(sk), ossl_check_SSL_COMP_type(ptr)) +#define sk_SSL_COMP_unshift(sk, ptr) OPENSSL_sk_unshift(ossl_check_SSL_COMP_sk_type(sk), ossl_check_SSL_COMP_type(ptr)) +#define sk_SSL_COMP_pop(sk) ((SSL_COMP *)OPENSSL_sk_pop(ossl_check_SSL_COMP_sk_type(sk))) +#define sk_SSL_COMP_shift(sk) ((SSL_COMP *)OPENSSL_sk_shift(ossl_check_SSL_COMP_sk_type(sk))) +#define sk_SSL_COMP_pop_free(sk, freefunc) OPENSSL_sk_pop_free(ossl_check_SSL_COMP_sk_type(sk),ossl_check_SSL_COMP_freefunc_type(freefunc)) +#define sk_SSL_COMP_insert(sk, ptr, idx) OPENSSL_sk_insert(ossl_check_SSL_COMP_sk_type(sk), ossl_check_SSL_COMP_type(ptr), (idx)) +#define sk_SSL_COMP_set(sk, idx, ptr) ((SSL_COMP *)OPENSSL_sk_set(ossl_check_SSL_COMP_sk_type(sk), (idx), ossl_check_SSL_COMP_type(ptr))) +#define sk_SSL_COMP_find(sk, ptr) OPENSSL_sk_find(ossl_check_SSL_COMP_sk_type(sk), ossl_check_SSL_COMP_type(ptr)) +#define sk_SSL_COMP_find_ex(sk, ptr) OPENSSL_sk_find_ex(ossl_check_SSL_COMP_sk_type(sk), ossl_check_SSL_COMP_type(ptr)) +#define 
sk_SSL_COMP_find_all(sk, ptr, pnum) OPENSSL_sk_find_all(ossl_check_SSL_COMP_sk_type(sk), ossl_check_SSL_COMP_type(ptr), pnum) +#define sk_SSL_COMP_sort(sk) OPENSSL_sk_sort(ossl_check_SSL_COMP_sk_type(sk)) +#define sk_SSL_COMP_is_sorted(sk) OPENSSL_sk_is_sorted(ossl_check_const_SSL_COMP_sk_type(sk)) +#define sk_SSL_COMP_dup(sk) ((STACK_OF(SSL_COMP) *)OPENSSL_sk_dup(ossl_check_const_SSL_COMP_sk_type(sk))) +#define sk_SSL_COMP_deep_copy(sk, copyfunc, freefunc) ((STACK_OF(SSL_COMP) *)OPENSSL_sk_deep_copy(ossl_check_const_SSL_COMP_sk_type(sk), ossl_check_SSL_COMP_copyfunc_type(copyfunc), ossl_check_SSL_COMP_freefunc_type(freefunc))) +#define sk_SSL_COMP_set_cmp_func(sk, cmp) ((sk_SSL_COMP_compfunc)OPENSSL_sk_set_cmp_func(ossl_check_SSL_COMP_sk_type(sk), ossl_check_SSL_COMP_compfunc_type(cmp))) + + + +# ifdef __cplusplus +} +# endif +#endif diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/include/openssl/conf.h b/CryptoPkg/Library/OpensslLib/OpensslGen/include/openssl/conf.h index b255b5a1f486..195bb014db26 100644 --- a/CryptoPkg/Library/OpensslLib/OpensslGen/include/openssl/conf.h +++ b/CryptoPkg/Library/OpensslLib/OpensslGen/include/openssl/conf.h @@ -27,6 +27,9 @@ # include # include # include +# ifndef OPENSSL_NO_STDIO +# include +# endif #ifdef __cplusplus extern "C" { @@ -65,7 +68,7 @@ SKM_DEFINE_STACK_OF_INTERNAL(CONF_VALUE, CONF_VALUE, CONF_VALUE) #define sk_CONF_VALUE_deep_copy(sk, copyfunc, freefunc) ((STACK_OF(CONF_VALUE) *)OPENSSL_sk_deep_copy(ossl_check_const_CONF_VALUE_sk_type(sk), ossl_check_CONF_VALUE_copyfunc_type(copyfunc), ossl_check_CONF_VALUE_freefunc_type(freefunc))) #define sk_CONF_VALUE_set_cmp_func(sk, cmp) ((sk_CONF_VALUE_compfunc)OPENSSL_sk_set_cmp_func(ossl_check_CONF_VALUE_sk_type(sk), ossl_check_CONF_VALUE_compfunc_type(cmp))) DEFINE_LHASH_OF_INTERNAL(CONF_VALUE); -#define lh_CONF_VALUE_new(hfn, cmp) ((LHASH_OF(CONF_VALUE) *)OPENSSL_LH_new(ossl_check_CONF_VALUE_lh_hashfunc_type(hfn), ossl_check_CONF_VALUE_lh_compfunc_type(cmp))) +#define lh_CONF_VALUE_new(hfn, cmp) ((LHASH_OF(CONF_VALUE) *)OPENSSL_LH_set_thunks(OPENSSL_LH_new(ossl_check_CONF_VALUE_lh_hashfunc_type(hfn), ossl_check_CONF_VALUE_lh_compfunc_type(cmp)), lh_CONF_VALUE_hash_thunk, lh_CONF_VALUE_comp_thunk, lh_CONF_VALUE_doall_thunk, lh_CONF_VALUE_doall_arg_thunk)) #define lh_CONF_VALUE_free(lh) OPENSSL_LH_free(ossl_check_CONF_VALUE_lh_type(lh)) #define lh_CONF_VALUE_flush(lh) OPENSSL_LH_flush(ossl_check_CONF_VALUE_lh_type(lh)) #define lh_CONF_VALUE_insert(lh, ptr) ((CONF_VALUE *)OPENSSL_LH_insert(ossl_check_CONF_VALUE_lh_type(lh), ossl_check_CONF_VALUE_lh_plain_type(ptr))) diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/include/openssl/configuration-ec.h b/CryptoPkg/Library/OpensslLib/OpensslGen/include/openssl/configuration-ec.h index 437ede74d7ac..a292da4b5b93 100644 --- a/CryptoPkg/Library/OpensslLib/OpensslGen/include/openssl/configuration-ec.h +++ b/CryptoPkg/Library/OpensslLib/OpensslGen/include/openssl/configuration-ec.h @@ -43,6 +43,9 @@ extern "C" { # ifndef OPENSSL_NO_APPS # define OPENSSL_NO_APPS # endif +# ifndef OPENSSL_NO_ARGON2 +# define OPENSSL_NO_ARGON2 +# endif # ifndef OPENSSL_NO_ARIA # define OPENSSL_NO_ARIA # endif @@ -64,6 +67,12 @@ extern "C" { # ifndef OPENSSL_NO_BLAKE2 # define OPENSSL_NO_BLAKE2 # endif +# ifndef OPENSSL_NO_BROTLI +# define OPENSSL_NO_BROTLI +# endif +# ifndef OPENSSL_NO_BROTLI_DYNAMIC +# define OPENSSL_NO_BROTLI_DYNAMIC +# endif # ifndef OPENSSL_NO_CAMELLIA # define OPENSSL_NO_CAMELLIA # endif @@ -97,6 +106,12 @@ extern "C" { # ifndef OPENSSL_NO_CT # 
define OPENSSL_NO_CT # endif +# ifndef OPENSSL_NO_DEFAULT_THREAD_POOL +# define OPENSSL_NO_DEFAULT_THREAD_POOL +# endif +# ifndef OPENSSL_NO_DEMOS +# define OPENSSL_NO_DEMOS +# endif # ifndef OPENSSL_NO_DEPRECATED # define OPENSSL_NO_DEPRECATED # endif @@ -151,6 +166,9 @@ extern "C" { # ifndef OPENSSL_NO_FILENAMES # define OPENSSL_NO_FILENAMES # endif +# ifndef OPENSSL_NO_FIPS_POST +# define OPENSSL_NO_FIPS_POST +# endif # ifndef OPENSSL_NO_FIPS_SECURITYCHECKS # define OPENSSL_NO_FIPS_SECURITYCHECKS # endif @@ -163,9 +181,15 @@ extern "C" { # ifndef OPENSSL_NO_GOST # define OPENSSL_NO_GOST # endif +# ifndef OPENSSL_NO_H3DEMO +# define OPENSSL_NO_H3DEMO +# endif # ifndef OPENSSL_NO_IDEA # define OPENSSL_NO_IDEA # endif +# ifndef OPENSSL_NO_JITTER +# define OPENSSL_NO_JITTER +# endif # ifndef OPENSSL_NO_KTLS # define OPENSSL_NO_KTLS # endif @@ -199,6 +223,9 @@ extern "C" { # ifndef OPENSSL_NO_PADLOCKENG # define OPENSSL_NO_PADLOCKENG # endif +# ifndef OPENSSL_NO_PIE +# define OPENSSL_NO_PIE +# endif # ifndef OPENSSL_NO_POLY1305 # define OPENSSL_NO_POLY1305 # endif @@ -208,6 +235,12 @@ extern "C" { # ifndef OPENSSL_NO_PSK # define OPENSSL_NO_PSK # endif +# ifndef OPENSSL_NO_QLOG +# define OPENSSL_NO_QLOG +# endif +# ifndef OPENSSL_NO_QUIC +# define OPENSSL_NO_QUIC +# endif # ifndef OPENSSL_NO_RC2 # define OPENSSL_NO_RC2 # endif @@ -268,6 +301,12 @@ extern "C" { # ifndef OPENSSL_NO_TESTS # define OPENSSL_NO_TESTS # endif +# ifndef OPENSSL_NO_TFO +# define OPENSSL_NO_TFO +# endif +# ifndef OPENSSL_NO_THREAD_POOL +# define OPENSSL_NO_THREAD_POOL +# endif # ifndef OPENSSL_NO_TLS1_3 # define OPENSSL_NO_TLS1_3 # endif @@ -286,6 +325,9 @@ extern "C" { # ifndef OPENSSL_NO_UNIT_TEST # define OPENSSL_NO_UNIT_TEST # endif +# ifndef OPENSSL_NO_UNSTABLE_QLOG +# define OPENSSL_NO_UNSTABLE_QLOG +# endif # ifndef OPENSSL_NO_UPLINK # define OPENSSL_NO_UPLINK # endif @@ -295,6 +337,21 @@ extern "C" { # ifndef OPENSSL_NO_WHIRLPOOL # define OPENSSL_NO_WHIRLPOOL # endif +# ifndef OPENSSL_NO_WINSTORE +# define OPENSSL_NO_WINSTORE +# endif +# ifndef OPENSSL_NO_ZLIB +# define OPENSSL_NO_ZLIB +# endif +# ifndef OPENSSL_NO_ZLIB_DYNAMIC +# define OPENSSL_NO_ZLIB_DYNAMIC +# endif +# ifndef OPENSSL_NO_ZSTD +# define OPENSSL_NO_ZSTD +# endif +# ifndef OPENSSL_NO_ZSTD_DYNAMIC +# define OPENSSL_NO_ZSTD_DYNAMIC +# endif # ifndef OPENSSL_NO_DYNAMIC_ENGINE # define OPENSSL_NO_DYNAMIC_ENGINE # endif @@ -316,6 +373,12 @@ extern "C" { # define RC4_INT unsigned int +# if defined(OPENSSL_NO_COMP) || (defined(OPENSSL_NO_BROTLI) && defined(OPENSSL_NO_ZSTD) && defined(OPENSSL_NO_ZLIB)) +# define OPENSSL_NO_COMP_ALG +# else +# undef OPENSSL_NO_COMP_ALG +# endif + # ifdef __cplusplus } # endif diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/include/openssl/configuration-noec.h b/CryptoPkg/Library/OpensslLib/OpensslGen/include/openssl/configuration-noec.h index 018225780b3b..14bb6dca8b01 100644 --- a/CryptoPkg/Library/OpensslLib/OpensslGen/include/openssl/configuration-noec.h +++ b/CryptoPkg/Library/OpensslLib/OpensslGen/include/openssl/configuration-noec.h @@ -43,6 +43,9 @@ extern "C" { # ifndef OPENSSL_NO_APPS # define OPENSSL_NO_APPS # endif +# ifndef OPENSSL_NO_ARGON2 +# define OPENSSL_NO_ARGON2 +# endif # ifndef OPENSSL_NO_ARIA # define OPENSSL_NO_ARIA # endif @@ -64,6 +67,12 @@ extern "C" { # ifndef OPENSSL_NO_BLAKE2 # define OPENSSL_NO_BLAKE2 # endif +# ifndef OPENSSL_NO_BROTLI +# define OPENSSL_NO_BROTLI +# endif +# ifndef OPENSSL_NO_BROTLI_DYNAMIC +# define OPENSSL_NO_BROTLI_DYNAMIC +# endif # ifndef OPENSSL_NO_CAMELLIA # 
define OPENSSL_NO_CAMELLIA # endif @@ -97,6 +106,12 @@ extern "C" { # ifndef OPENSSL_NO_CT # define OPENSSL_NO_CT # endif +# ifndef OPENSSL_NO_DEFAULT_THREAD_POOL +# define OPENSSL_NO_DEFAULT_THREAD_POOL +# endif +# ifndef OPENSSL_NO_DEMOS +# define OPENSSL_NO_DEMOS +# endif # ifndef OPENSSL_NO_DEPRECATED # define OPENSSL_NO_DEPRECATED # endif @@ -145,6 +160,9 @@ extern "C" { # ifndef OPENSSL_NO_ECDSA # define OPENSSL_NO_ECDSA # endif +# ifndef OPENSSL_NO_ECX +# define OPENSSL_NO_ECX +# endif # ifndef OPENSSL_NO_EGD # define OPENSSL_NO_EGD # endif @@ -160,6 +178,9 @@ extern "C" { # ifndef OPENSSL_NO_FILENAMES # define OPENSSL_NO_FILENAMES # endif +# ifndef OPENSSL_NO_FIPS_POST +# define OPENSSL_NO_FIPS_POST +# endif # ifndef OPENSSL_NO_FIPS_SECURITYCHECKS # define OPENSSL_NO_FIPS_SECURITYCHECKS # endif @@ -172,9 +193,15 @@ extern "C" { # ifndef OPENSSL_NO_GOST # define OPENSSL_NO_GOST # endif +# ifndef OPENSSL_NO_H3DEMO +# define OPENSSL_NO_H3DEMO +# endif # ifndef OPENSSL_NO_IDEA # define OPENSSL_NO_IDEA # endif +# ifndef OPENSSL_NO_JITTER +# define OPENSSL_NO_JITTER +# endif # ifndef OPENSSL_NO_KTLS # define OPENSSL_NO_KTLS # endif @@ -208,6 +235,9 @@ extern "C" { # ifndef OPENSSL_NO_PADLOCKENG # define OPENSSL_NO_PADLOCKENG # endif +# ifndef OPENSSL_NO_PIE +# define OPENSSL_NO_PIE +# endif # ifndef OPENSSL_NO_POLY1305 # define OPENSSL_NO_POLY1305 # endif @@ -217,6 +247,12 @@ extern "C" { # ifndef OPENSSL_NO_PSK # define OPENSSL_NO_PSK # endif +# ifndef OPENSSL_NO_QLOG +# define OPENSSL_NO_QLOG +# endif +# ifndef OPENSSL_NO_QUIC +# define OPENSSL_NO_QUIC +# endif # ifndef OPENSSL_NO_RC2 # define OPENSSL_NO_RC2 # endif @@ -277,6 +313,12 @@ extern "C" { # ifndef OPENSSL_NO_TESTS # define OPENSSL_NO_TESTS # endif +# ifndef OPENSSL_NO_TFO +# define OPENSSL_NO_TFO +# endif +# ifndef OPENSSL_NO_THREAD_POOL +# define OPENSSL_NO_THREAD_POOL +# endif # ifndef OPENSSL_NO_TLS1_3 # define OPENSSL_NO_TLS1_3 # endif @@ -295,6 +337,9 @@ extern "C" { # ifndef OPENSSL_NO_UNIT_TEST # define OPENSSL_NO_UNIT_TEST # endif +# ifndef OPENSSL_NO_UNSTABLE_QLOG +# define OPENSSL_NO_UNSTABLE_QLOG +# endif # ifndef OPENSSL_NO_UPLINK # define OPENSSL_NO_UPLINK # endif @@ -304,6 +349,21 @@ extern "C" { # ifndef OPENSSL_NO_WHIRLPOOL # define OPENSSL_NO_WHIRLPOOL # endif +# ifndef OPENSSL_NO_WINSTORE +# define OPENSSL_NO_WINSTORE +# endif +# ifndef OPENSSL_NO_ZLIB +# define OPENSSL_NO_ZLIB +# endif +# ifndef OPENSSL_NO_ZLIB_DYNAMIC +# define OPENSSL_NO_ZLIB_DYNAMIC +# endif +# ifndef OPENSSL_NO_ZSTD +# define OPENSSL_NO_ZSTD +# endif +# ifndef OPENSSL_NO_ZSTD_DYNAMIC +# define OPENSSL_NO_ZSTD_DYNAMIC +# endif # ifndef OPENSSL_NO_DYNAMIC_ENGINE # define OPENSSL_NO_DYNAMIC_ENGINE # endif @@ -325,6 +385,12 @@ extern "C" { # define RC4_INT unsigned int +# if defined(OPENSSL_NO_COMP) || (defined(OPENSSL_NO_BROTLI) && defined(OPENSSL_NO_ZSTD) && defined(OPENSSL_NO_ZLIB)) +# define OPENSSL_NO_COMP_ALG +# else +# undef OPENSSL_NO_COMP_ALG +# endif + # ifdef __cplusplus } # endif diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/include/openssl/core_names.h b/CryptoPkg/Library/OpensslLib/OpensslGen/include/openssl/core_names.h new file mode 100644 index 000000000000..072a6b8a8cc9 --- /dev/null +++ b/CryptoPkg/Library/OpensslLib/OpensslGen/include/openssl/core_names.h @@ -0,0 +1,545 @@ +/* + * WARNING: do not edit! + * Generated by Makefile from include/openssl/core_names.h.in + * + * Copyright 2019-2023 The OpenSSL Project Authors. All Rights Reserved. + * + * Licensed under the Apache License 2.0 (the "License"). 
You may not use + * this file except in compliance with the License. You can obtain a copy + * in the file LICENSE in the source distribution or at + * https://www.openssl.org/source/license.html + */ + + +#ifndef OPENSSL_CORE_NAMES_H +# define OPENSSL_CORE_NAMES_H +# pragma once + +# ifdef __cplusplus +extern "C" { +# endif + +/* OSSL_CIPHER_PARAM_CTS_MODE Values */ +# define OSSL_CIPHER_CTS_MODE_CS1 "CS1" +# define OSSL_CIPHER_CTS_MODE_CS2 "CS2" +# define OSSL_CIPHER_CTS_MODE_CS3 "CS3" + +/* Known CIPHER names (not a complete list) */ +# define OSSL_CIPHER_NAME_AES_128_GCM_SIV "AES-128-GCM-SIV" +# define OSSL_CIPHER_NAME_AES_192_GCM_SIV "AES-192-GCM-SIV" +# define OSSL_CIPHER_NAME_AES_256_GCM_SIV "AES-256-GCM-SIV" + +/* Known DIGEST names (not a complete list) */ +# define OSSL_DIGEST_NAME_MD5 "MD5" +# define OSSL_DIGEST_NAME_MD5_SHA1 "MD5-SHA1" +# define OSSL_DIGEST_NAME_SHA1 "SHA1" +# define OSSL_DIGEST_NAME_SHA2_224 "SHA2-224" +# define OSSL_DIGEST_NAME_SHA2_256 "SHA2-256" +# define OSSL_DIGEST_NAME_SHA2_256_192 "SHA2-256/192" +# define OSSL_DIGEST_NAME_SHA2_384 "SHA2-384" +# define OSSL_DIGEST_NAME_SHA2_512 "SHA2-512" +# define OSSL_DIGEST_NAME_SHA2_512_224 "SHA2-512/224" +# define OSSL_DIGEST_NAME_SHA2_512_256 "SHA2-512/256" +# define OSSL_DIGEST_NAME_MD2 "MD2" +# define OSSL_DIGEST_NAME_MD4 "MD4" +# define OSSL_DIGEST_NAME_MDC2 "MDC2" +# define OSSL_DIGEST_NAME_RIPEMD160 "RIPEMD160" +# define OSSL_DIGEST_NAME_SHA3_224 "SHA3-224" +# define OSSL_DIGEST_NAME_SHA3_256 "SHA3-256" +# define OSSL_DIGEST_NAME_SHA3_384 "SHA3-384" +# define OSSL_DIGEST_NAME_SHA3_512 "SHA3-512" +# define OSSL_DIGEST_NAME_KECCAK_KMAC128 "KECCAK-KMAC-128" +# define OSSL_DIGEST_NAME_KECCAK_KMAC256 "KECCAK-KMAC-256" +# define OSSL_DIGEST_NAME_SM3 "SM3" + +/* Known MAC names */ +# define OSSL_MAC_NAME_BLAKE2BMAC "BLAKE2BMAC" +# define OSSL_MAC_NAME_BLAKE2SMAC "BLAKE2SMAC" +# define OSSL_MAC_NAME_CMAC "CMAC" +# define OSSL_MAC_NAME_GMAC "GMAC" +# define OSSL_MAC_NAME_HMAC "HMAC" +# define OSSL_MAC_NAME_KMAC128 "KMAC128" +# define OSSL_MAC_NAME_KMAC256 "KMAC256" +# define OSSL_MAC_NAME_POLY1305 "POLY1305" +# define OSSL_MAC_NAME_SIPHASH "SIPHASH" + +/* Known KDF names */ +# define OSSL_KDF_NAME_HKDF "HKDF" +# define OSSL_KDF_NAME_TLS1_3_KDF "TLS13-KDF" +# define OSSL_KDF_NAME_PBKDF1 "PBKDF1" +# define OSSL_KDF_NAME_PBKDF2 "PBKDF2" +# define OSSL_KDF_NAME_SCRYPT "SCRYPT" +# define OSSL_KDF_NAME_SSHKDF "SSHKDF" +# define OSSL_KDF_NAME_SSKDF "SSKDF" +# define OSSL_KDF_NAME_TLS1_PRF "TLS1-PRF" +# define OSSL_KDF_NAME_X942KDF_ASN1 "X942KDF-ASN1" +# define OSSL_KDF_NAME_X942KDF_CONCAT "X942KDF-CONCAT" +# define OSSL_KDF_NAME_X963KDF "X963KDF" +# define OSSL_KDF_NAME_KBKDF "KBKDF" +# define OSSL_KDF_NAME_KRB5KDF "KRB5KDF" +# define OSSL_KDF_NAME_HMACDRBGKDF "HMAC-DRBG-KDF" + +/* RSA padding modes */ +# define OSSL_PKEY_RSA_PAD_MODE_NONE "none" +# define OSSL_PKEY_RSA_PAD_MODE_PKCSV15 "pkcs1" +# define OSSL_PKEY_RSA_PAD_MODE_OAEP "oaep" +# define OSSL_PKEY_RSA_PAD_MODE_X931 "x931" +# define OSSL_PKEY_RSA_PAD_MODE_PSS "pss" + +/* RSA pss padding salt length */ +# define OSSL_PKEY_RSA_PSS_SALT_LEN_DIGEST "digest" +# define OSSL_PKEY_RSA_PSS_SALT_LEN_MAX "max" +# define OSSL_PKEY_RSA_PSS_SALT_LEN_AUTO "auto" +# define OSSL_PKEY_RSA_PSS_SALT_LEN_AUTO_DIGEST_MAX "auto-digestmax" + +/* OSSL_PKEY_PARAM_EC_ENCODING values */ +# define OSSL_PKEY_EC_ENCODING_EXPLICIT "explicit" +# define OSSL_PKEY_EC_ENCODING_GROUP "named_curve" + +# define OSSL_PKEY_EC_POINT_CONVERSION_FORMAT_UNCOMPRESSED "uncompressed" +# define 
OSSL_PKEY_EC_POINT_CONVERSION_FORMAT_COMPRESSED "compressed" +# define OSSL_PKEY_EC_POINT_CONVERSION_FORMAT_HYBRID "hybrid" + +# define OSSL_PKEY_EC_GROUP_CHECK_DEFAULT "default" +# define OSSL_PKEY_EC_GROUP_CHECK_NAMED "named" +# define OSSL_PKEY_EC_GROUP_CHECK_NAMED_NIST "named-nist" + +/* OSSL_KEM_PARAM_OPERATION values */ +#define OSSL_KEM_PARAM_OPERATION_RSASVE "RSASVE" +#define OSSL_KEM_PARAM_OPERATION_DHKEM "DHKEM" + +/* Parameter name definitions - generated by util/perl/OpenSSL/paramnames.pm */ +# define OSSL_ALG_PARAM_ALGORITHM_ID "algorithm-id" +# define OSSL_ALG_PARAM_ALGORITHM_ID_PARAMS "algorithm-id-params" +# define OSSL_ALG_PARAM_CIPHER "cipher" +# define OSSL_ALG_PARAM_DIGEST "digest" +# define OSSL_ALG_PARAM_ENGINE "engine" +# define OSSL_ALG_PARAM_FIPS_APPROVED_INDICATOR "fips-indicator" +# define OSSL_ALG_PARAM_MAC "mac" +# define OSSL_ALG_PARAM_PROPERTIES "properties" +# define OSSL_ASYM_CIPHER_PARAM_DIGEST OSSL_PKEY_PARAM_DIGEST +# define OSSL_ASYM_CIPHER_PARAM_ENGINE OSSL_PKEY_PARAM_ENGINE +# define OSSL_ASYM_CIPHER_PARAM_FIPS_APPROVED_INDICATOR OSSL_ALG_PARAM_FIPS_APPROVED_INDICATOR +# define OSSL_ASYM_CIPHER_PARAM_FIPS_KEY_CHECK OSSL_PKEY_PARAM_FIPS_KEY_CHECK +# define OSSL_ASYM_CIPHER_PARAM_FIPS_RSA_PKCS15_PAD_DISABLED OSSL_PROV_PARAM_RSA_PKCS15_PAD_DISABLED +# define OSSL_ASYM_CIPHER_PARAM_IMPLICIT_REJECTION "implicit-rejection" +# define OSSL_ASYM_CIPHER_PARAM_MGF1_DIGEST OSSL_PKEY_PARAM_MGF1_DIGEST +# define OSSL_ASYM_CIPHER_PARAM_MGF1_DIGEST_PROPS OSSL_PKEY_PARAM_MGF1_PROPERTIES +# define OSSL_ASYM_CIPHER_PARAM_OAEP_DIGEST OSSL_ALG_PARAM_DIGEST +# define OSSL_ASYM_CIPHER_PARAM_OAEP_DIGEST_PROPS "digest-props" +# define OSSL_ASYM_CIPHER_PARAM_OAEP_LABEL "oaep-label" +# define OSSL_ASYM_CIPHER_PARAM_PAD_MODE OSSL_PKEY_PARAM_PAD_MODE +# define OSSL_ASYM_CIPHER_PARAM_PROPERTIES OSSL_PKEY_PARAM_PROPERTIES +# define OSSL_ASYM_CIPHER_PARAM_TLS_CLIENT_VERSION "tls-client-version" +# define OSSL_ASYM_CIPHER_PARAM_TLS_NEGOTIATED_VERSION "tls-negotiated-version" +# define OSSL_CAPABILITY_TLS_GROUP_ALG "tls-group-alg" +# define OSSL_CAPABILITY_TLS_GROUP_ID "tls-group-id" +# define OSSL_CAPABILITY_TLS_GROUP_IS_KEM "tls-group-is-kem" +# define OSSL_CAPABILITY_TLS_GROUP_MAX_DTLS "tls-max-dtls" +# define OSSL_CAPABILITY_TLS_GROUP_MAX_TLS "tls-max-tls" +# define OSSL_CAPABILITY_TLS_GROUP_MIN_DTLS "tls-min-dtls" +# define OSSL_CAPABILITY_TLS_GROUP_MIN_TLS "tls-min-tls" +# define OSSL_CAPABILITY_TLS_GROUP_NAME "tls-group-name" +# define OSSL_CAPABILITY_TLS_GROUP_NAME_INTERNAL "tls-group-name-internal" +# define OSSL_CAPABILITY_TLS_GROUP_SECURITY_BITS "tls-group-sec-bits" +# define OSSL_CAPABILITY_TLS_SIGALG_CODE_POINT "tls-sigalg-code-point" +# define OSSL_CAPABILITY_TLS_SIGALG_HASH_NAME "tls-sigalg-hash-name" +# define OSSL_CAPABILITY_TLS_SIGALG_HASH_OID "tls-sigalg-hash-oid" +# define OSSL_CAPABILITY_TLS_SIGALG_IANA_NAME "tls-sigalg-iana-name" +# define OSSL_CAPABILITY_TLS_SIGALG_KEYTYPE "tls-sigalg-keytype" +# define OSSL_CAPABILITY_TLS_SIGALG_KEYTYPE_OID "tls-sigalg-keytype-oid" +# define OSSL_CAPABILITY_TLS_SIGALG_MAX_TLS "tls-max-tls" +# define OSSL_CAPABILITY_TLS_SIGALG_MIN_TLS "tls-min-tls" +# define OSSL_CAPABILITY_TLS_SIGALG_NAME "tls-sigalg-name" +# define OSSL_CAPABILITY_TLS_SIGALG_OID "tls-sigalg-oid" +# define OSSL_CAPABILITY_TLS_SIGALG_SECURITY_BITS "tls-sigalg-sec-bits" +# define OSSL_CAPABILITY_TLS_SIGALG_SIG_NAME "tls-sigalg-sig-name" +# define OSSL_CAPABILITY_TLS_SIGALG_SIG_OID "tls-sigalg-sig-oid" +# define OSSL_CIPHER_PARAM_AEAD "aead" +# define 
OSSL_CIPHER_PARAM_AEAD_IVLEN OSSL_CIPHER_PARAM_IVLEN +# define OSSL_CIPHER_PARAM_AEAD_IV_GENERATED "iv-generated" +# define OSSL_CIPHER_PARAM_AEAD_MAC_KEY "mackey" +# define OSSL_CIPHER_PARAM_AEAD_TAG "tag" +# define OSSL_CIPHER_PARAM_AEAD_TAGLEN "taglen" +# define OSSL_CIPHER_PARAM_AEAD_TLS1_AAD "tlsaad" +# define OSSL_CIPHER_PARAM_AEAD_TLS1_AAD_PAD "tlsaadpad" +# define OSSL_CIPHER_PARAM_AEAD_TLS1_GET_IV_GEN "tlsivgen" +# define OSSL_CIPHER_PARAM_AEAD_TLS1_IV_FIXED "tlsivfixed" +# define OSSL_CIPHER_PARAM_AEAD_TLS1_SET_IV_INV "tlsivinv" +# define OSSL_CIPHER_PARAM_ALGORITHM_ID OSSL_ALG_PARAM_ALGORITHM_ID +# define OSSL_CIPHER_PARAM_ALGORITHM_ID_PARAMS OSSL_ALG_PARAM_ALGORITHM_ID_PARAMS +# define OSSL_CIPHER_PARAM_ALGORITHM_ID_PARAMS_OLD "alg_id_param" +# define OSSL_CIPHER_PARAM_BLOCK_SIZE "blocksize" +# define OSSL_CIPHER_PARAM_CTS "cts" +# define OSSL_CIPHER_PARAM_CTS_MODE "cts_mode" +# define OSSL_CIPHER_PARAM_CUSTOM_IV "custom-iv" +# define OSSL_CIPHER_PARAM_DECRYPT_ONLY "decrypt-only" +# define OSSL_CIPHER_PARAM_FIPS_APPROVED_INDICATOR OSSL_ALG_PARAM_FIPS_APPROVED_INDICATOR +# define OSSL_CIPHER_PARAM_FIPS_ENCRYPT_CHECK "encrypt-check" +# define OSSL_CIPHER_PARAM_HAS_RAND_KEY "has-randkey" +# define OSSL_CIPHER_PARAM_IV "iv" +# define OSSL_CIPHER_PARAM_IVLEN "ivlen" +# define OSSL_CIPHER_PARAM_KEYLEN "keylen" +# define OSSL_CIPHER_PARAM_MODE "mode" +# define OSSL_CIPHER_PARAM_NUM "num" +# define OSSL_CIPHER_PARAM_PADDING "padding" +# define OSSL_CIPHER_PARAM_RANDOM_KEY "randkey" +# define OSSL_CIPHER_PARAM_RC2_KEYBITS "keybits" +# define OSSL_CIPHER_PARAM_ROUNDS "rounds" +# define OSSL_CIPHER_PARAM_SPEED "speed" +# define OSSL_CIPHER_PARAM_TLS1_MULTIBLOCK "tls-multi" +# define OSSL_CIPHER_PARAM_TLS1_MULTIBLOCK_AAD "tls1multi_aad" +# define OSSL_CIPHER_PARAM_TLS1_MULTIBLOCK_AAD_PACKLEN "tls1multi_aadpacklen" +# define OSSL_CIPHER_PARAM_TLS1_MULTIBLOCK_ENC "tls1multi_enc" +# define OSSL_CIPHER_PARAM_TLS1_MULTIBLOCK_ENC_IN "tls1multi_encin" +# define OSSL_CIPHER_PARAM_TLS1_MULTIBLOCK_ENC_LEN "tls1multi_enclen" +# define OSSL_CIPHER_PARAM_TLS1_MULTIBLOCK_INTERLEAVE "tls1multi_interleave" +# define OSSL_CIPHER_PARAM_TLS1_MULTIBLOCK_MAX_BUFSIZE "tls1multi_maxbufsz" +# define OSSL_CIPHER_PARAM_TLS1_MULTIBLOCK_MAX_SEND_FRAGMENT "tls1multi_maxsndfrag" +# define OSSL_CIPHER_PARAM_TLS_MAC "tls-mac" +# define OSSL_CIPHER_PARAM_TLS_MAC_SIZE "tls-mac-size" +# define OSSL_CIPHER_PARAM_TLS_VERSION "tls-version" +# define OSSL_CIPHER_PARAM_UPDATED_IV "updated-iv" +# define OSSL_CIPHER_PARAM_USE_BITS "use-bits" +# define OSSL_CIPHER_PARAM_XTS_STANDARD "xts_standard" +# define OSSL_DECODER_PARAM_PROPERTIES OSSL_ALG_PARAM_PROPERTIES +# define OSSL_DIGEST_PARAM_ALGID_ABSENT "algid-absent" +# define OSSL_DIGEST_PARAM_BLOCK_SIZE "blocksize" +# define OSSL_DIGEST_PARAM_MICALG "micalg" +# define OSSL_DIGEST_PARAM_PAD_TYPE "pad-type" +# define OSSL_DIGEST_PARAM_SIZE "size" +# define OSSL_DIGEST_PARAM_SSL3_MS "ssl3-ms" +# define OSSL_DIGEST_PARAM_XOF "xof" +# define OSSL_DIGEST_PARAM_XOFLEN "xoflen" +# define OSSL_DRBG_PARAM_CIPHER OSSL_ALG_PARAM_CIPHER +# define OSSL_DRBG_PARAM_DIGEST OSSL_ALG_PARAM_DIGEST +# define OSSL_DRBG_PARAM_ENTROPY_REQUIRED "entropy_required" +# define OSSL_DRBG_PARAM_FIPS_APPROVED_INDICATOR OSSL_ALG_PARAM_FIPS_APPROVED_INDICATOR +# define OSSL_DRBG_PARAM_FIPS_DIGEST_CHECK OSSL_PKEY_PARAM_FIPS_DIGEST_CHECK +# define OSSL_DRBG_PARAM_MAC OSSL_ALG_PARAM_MAC +# define OSSL_DRBG_PARAM_MAX_ADINLEN "max_adinlen" +# define OSSL_DRBG_PARAM_MAX_ENTROPYLEN "max_entropylen" +# define 
OSSL_DRBG_PARAM_MAX_LENGTH "maxium_length" +# define OSSL_DRBG_PARAM_MAX_NONCELEN "max_noncelen" +# define OSSL_DRBG_PARAM_MAX_PERSLEN "max_perslen" +# define OSSL_DRBG_PARAM_MIN_ENTROPYLEN "min_entropylen" +# define OSSL_DRBG_PARAM_MIN_LENGTH "minium_length" +# define OSSL_DRBG_PARAM_MIN_NONCELEN "min_noncelen" +# define OSSL_DRBG_PARAM_PREDICTION_RESISTANCE "prediction_resistance" +# define OSSL_DRBG_PARAM_PROPERTIES OSSL_ALG_PARAM_PROPERTIES +# define OSSL_DRBG_PARAM_RANDOM_DATA "random_data" +# define OSSL_DRBG_PARAM_RESEED_COUNTER "reseed_counter" +# define OSSL_DRBG_PARAM_RESEED_REQUESTS "reseed_requests" +# define OSSL_DRBG_PARAM_RESEED_TIME "reseed_time" +# define OSSL_DRBG_PARAM_RESEED_TIME_INTERVAL "reseed_time_interval" +# define OSSL_DRBG_PARAM_SIZE "size" +# define OSSL_DRBG_PARAM_USE_DF "use_derivation_function" +# define OSSL_ENCODER_PARAM_CIPHER OSSL_ALG_PARAM_CIPHER +# define OSSL_ENCODER_PARAM_ENCRYPT_LEVEL "encrypt-level" +# define OSSL_ENCODER_PARAM_PROPERTIES OSSL_ALG_PARAM_PROPERTIES +# define OSSL_ENCODER_PARAM_SAVE_PARAMETERS "save-parameters" +# define OSSL_EXCHANGE_PARAM_EC_ECDH_COFACTOR_MODE "ecdh-cofactor-mode" +# define OSSL_EXCHANGE_PARAM_FIPS_APPROVED_INDICATOR OSSL_ALG_PARAM_FIPS_APPROVED_INDICATOR +# define OSSL_EXCHANGE_PARAM_FIPS_DIGEST_CHECK OSSL_PKEY_PARAM_FIPS_DIGEST_CHECK +# define OSSL_EXCHANGE_PARAM_FIPS_ECDH_COFACTOR_CHECK OSSL_PROV_PARAM_ECDH_COFACTOR_CHECK +# define OSSL_EXCHANGE_PARAM_FIPS_KEY_CHECK OSSL_PKEY_PARAM_FIPS_KEY_CHECK +# define OSSL_EXCHANGE_PARAM_KDF_DIGEST "kdf-digest" +# define OSSL_EXCHANGE_PARAM_KDF_DIGEST_PROPS "kdf-digest-props" +# define OSSL_EXCHANGE_PARAM_KDF_OUTLEN "kdf-outlen" +# define OSSL_EXCHANGE_PARAM_KDF_TYPE "kdf-type" +# define OSSL_EXCHANGE_PARAM_KDF_UKM "kdf-ukm" +# define OSSL_EXCHANGE_PARAM_PAD "pad" +# define OSSL_GEN_PARAM_ITERATION "iteration" +# define OSSL_GEN_PARAM_POTENTIAL "potential" +# define OSSL_KDF_PARAM_ARGON2_AD "ad" +# define OSSL_KDF_PARAM_ARGON2_LANES "lanes" +# define OSSL_KDF_PARAM_ARGON2_MEMCOST "memcost" +# define OSSL_KDF_PARAM_ARGON2_VERSION "version" +# define OSSL_KDF_PARAM_CEK_ALG "cekalg" +# define OSSL_KDF_PARAM_CIPHER OSSL_ALG_PARAM_CIPHER +# define OSSL_KDF_PARAM_CONSTANT "constant" +# define OSSL_KDF_PARAM_DATA "data" +# define OSSL_KDF_PARAM_DIGEST OSSL_ALG_PARAM_DIGEST +# define OSSL_KDF_PARAM_EARLY_CLEAN "early_clean" +# define OSSL_KDF_PARAM_FIPS_APPROVED_INDICATOR OSSL_ALG_PARAM_FIPS_APPROVED_INDICATOR +# define OSSL_KDF_PARAM_FIPS_DIGEST_CHECK OSSL_PKEY_PARAM_FIPS_DIGEST_CHECK +# define OSSL_KDF_PARAM_FIPS_EMS_CHECK "ems_check" +# define OSSL_KDF_PARAM_FIPS_KEY_CHECK OSSL_PKEY_PARAM_FIPS_KEY_CHECK +# define OSSL_KDF_PARAM_HMACDRBG_ENTROPY "entropy" +# define OSSL_KDF_PARAM_HMACDRBG_NONCE "nonce" +# define OSSL_KDF_PARAM_INFO "info" +# define OSSL_KDF_PARAM_ITER "iter" +# define OSSL_KDF_PARAM_KBKDF_R "r" +# define OSSL_KDF_PARAM_KBKDF_USE_L "use-l" +# define OSSL_KDF_PARAM_KBKDF_USE_SEPARATOR "use-separator" +# define OSSL_KDF_PARAM_KEY "key" +# define OSSL_KDF_PARAM_LABEL "label" +# define OSSL_KDF_PARAM_MAC OSSL_ALG_PARAM_MAC +# define OSSL_KDF_PARAM_MAC_SIZE "maclen" +# define OSSL_KDF_PARAM_MODE "mode" +# define OSSL_KDF_PARAM_PASSWORD "pass" +# define OSSL_KDF_PARAM_PKCS12_ID "id" +# define OSSL_KDF_PARAM_PKCS5 "pkcs5" +# define OSSL_KDF_PARAM_PREFIX "prefix" +# define OSSL_KDF_PARAM_PROPERTIES OSSL_ALG_PARAM_PROPERTIES +# define OSSL_KDF_PARAM_SALT "salt" +# define OSSL_KDF_PARAM_SCRYPT_MAXMEM "maxmem_bytes" +# define OSSL_KDF_PARAM_SCRYPT_N "n" +# define 
OSSL_KDF_PARAM_SCRYPT_P "p" +# define OSSL_KDF_PARAM_SCRYPT_R "r" +# define OSSL_KDF_PARAM_SECRET "secret" +# define OSSL_KDF_PARAM_SEED "seed" +# define OSSL_KDF_PARAM_SIZE "size" +# define OSSL_KDF_PARAM_SSHKDF_SESSION_ID "session_id" +# define OSSL_KDF_PARAM_SSHKDF_TYPE "type" +# define OSSL_KDF_PARAM_SSHKDF_XCGHASH "xcghash" +# define OSSL_KDF_PARAM_THREADS "threads" +# define OSSL_KDF_PARAM_UKM "ukm" +# define OSSL_KDF_PARAM_X942_ACVPINFO "acvp-info" +# define OSSL_KDF_PARAM_X942_PARTYUINFO "partyu-info" +# define OSSL_KDF_PARAM_X942_PARTYVINFO "partyv-info" +# define OSSL_KDF_PARAM_X942_SUPP_PRIVINFO "supp-privinfo" +# define OSSL_KDF_PARAM_X942_SUPP_PUBINFO "supp-pubinfo" +# define OSSL_KDF_PARAM_X942_USE_KEYBITS "use-keybits" +# define OSSL_KEM_PARAM_FIPS_APPROVED_INDICATOR OSSL_ALG_PARAM_FIPS_APPROVED_INDICATOR +# define OSSL_KEM_PARAM_FIPS_KEY_CHECK OSSL_PKEY_PARAM_FIPS_KEY_CHECK +# define OSSL_KEM_PARAM_IKME "ikme" +# define OSSL_KEM_PARAM_OPERATION "operation" +# define OSSL_LIBSSL_RECORD_LAYER_PARAM_BLOCK_PADDING "block_padding" +# define OSSL_LIBSSL_RECORD_LAYER_PARAM_HS_PADDING "hs_padding" +# define OSSL_LIBSSL_RECORD_LAYER_PARAM_MAX_EARLY_DATA "max_early_data" +# define OSSL_LIBSSL_RECORD_LAYER_PARAM_MAX_FRAG_LEN "max_frag_len" +# define OSSL_LIBSSL_RECORD_LAYER_PARAM_MODE "mode" +# define OSSL_LIBSSL_RECORD_LAYER_PARAM_OPTIONS "options" +# define OSSL_LIBSSL_RECORD_LAYER_PARAM_READ_AHEAD "read_ahead" +# define OSSL_LIBSSL_RECORD_LAYER_PARAM_STREAM_MAC "stream_mac" +# define OSSL_LIBSSL_RECORD_LAYER_PARAM_TLSTREE "tlstree" +# define OSSL_LIBSSL_RECORD_LAYER_PARAM_USE_ETM "use_etm" +# define OSSL_LIBSSL_RECORD_LAYER_READ_BUFFER_LEN "read_buffer_len" +# define OSSL_MAC_PARAM_BLOCK_SIZE "block-size" +# define OSSL_MAC_PARAM_CIPHER OSSL_ALG_PARAM_CIPHER +# define OSSL_MAC_PARAM_CUSTOM "custom" +# define OSSL_MAC_PARAM_C_ROUNDS "c-rounds" +# define OSSL_MAC_PARAM_DIGEST OSSL_ALG_PARAM_DIGEST +# define OSSL_MAC_PARAM_DIGEST_NOINIT "digest-noinit" +# define OSSL_MAC_PARAM_DIGEST_ONESHOT "digest-oneshot" +# define OSSL_MAC_PARAM_D_ROUNDS "d-rounds" +# define OSSL_MAC_PARAM_FIPS_APPROVED_INDICATOR OSSL_ALG_PARAM_FIPS_APPROVED_INDICATOR +# define OSSL_MAC_PARAM_FIPS_KEY_CHECK OSSL_PKEY_PARAM_FIPS_KEY_CHECK +# define OSSL_MAC_PARAM_FIPS_NO_SHORT_MAC OSSL_PROV_PARAM_NO_SHORT_MAC +# define OSSL_MAC_PARAM_IV "iv" +# define OSSL_MAC_PARAM_KEY "key" +# define OSSL_MAC_PARAM_PROPERTIES OSSL_ALG_PARAM_PROPERTIES +# define OSSL_MAC_PARAM_SALT "salt" +# define OSSL_MAC_PARAM_SIZE "size" +# define OSSL_MAC_PARAM_TLS_DATA_SIZE "tls-data-size" +# define OSSL_MAC_PARAM_XOF "xof" +# define OSSL_OBJECT_PARAM_DATA "data" +# define OSSL_OBJECT_PARAM_DATA_STRUCTURE "data-structure" +# define OSSL_OBJECT_PARAM_DATA_TYPE "data-type" +# define OSSL_OBJECT_PARAM_DESC "desc" +# define OSSL_OBJECT_PARAM_REFERENCE "reference" +# define OSSL_OBJECT_PARAM_TYPE "type" +# define OSSL_PASSPHRASE_PARAM_INFO "info" +# define OSSL_PKEY_PARAM_ALGORITHM_ID OSSL_ALG_PARAM_ALGORITHM_ID +# define OSSL_PKEY_PARAM_ALGORITHM_ID_PARAMS OSSL_ALG_PARAM_ALGORITHM_ID_PARAMS +# define OSSL_PKEY_PARAM_BITS "bits" +# define OSSL_PKEY_PARAM_CIPHER OSSL_ALG_PARAM_CIPHER +# define OSSL_PKEY_PARAM_DEFAULT_DIGEST "default-digest" +# define OSSL_PKEY_PARAM_DHKEM_IKM "dhkem-ikm" +# define OSSL_PKEY_PARAM_DH_GENERATOR "safeprime-generator" +# define OSSL_PKEY_PARAM_DH_PRIV_LEN "priv_len" +# define OSSL_PKEY_PARAM_DIGEST OSSL_ALG_PARAM_DIGEST +# define OSSL_PKEY_PARAM_DIGEST_SIZE "digest-size" +# define OSSL_PKEY_PARAM_DIST_ID "distid" +# 
define OSSL_PKEY_PARAM_EC_A "a" +# define OSSL_PKEY_PARAM_EC_B "b" +# define OSSL_PKEY_PARAM_EC_CHAR2_M "m" +# define OSSL_PKEY_PARAM_EC_CHAR2_PP_K1 "k1" +# define OSSL_PKEY_PARAM_EC_CHAR2_PP_K2 "k2" +# define OSSL_PKEY_PARAM_EC_CHAR2_PP_K3 "k3" +# define OSSL_PKEY_PARAM_EC_CHAR2_TP_BASIS "tp" +# define OSSL_PKEY_PARAM_EC_CHAR2_TYPE "basis-type" +# define OSSL_PKEY_PARAM_EC_COFACTOR "cofactor" +# define OSSL_PKEY_PARAM_EC_DECODED_FROM_EXPLICIT_PARAMS "decoded-from-explicit" +# define OSSL_PKEY_PARAM_EC_ENCODING "encoding" +# define OSSL_PKEY_PARAM_EC_FIELD_TYPE "field-type" +# define OSSL_PKEY_PARAM_EC_GENERATOR "generator" +# define OSSL_PKEY_PARAM_EC_GROUP_CHECK_TYPE "group-check" +# define OSSL_PKEY_PARAM_EC_INCLUDE_PUBLIC "include-public" +# define OSSL_PKEY_PARAM_EC_ORDER "order" +# define OSSL_PKEY_PARAM_EC_P "p" +# define OSSL_PKEY_PARAM_EC_POINT_CONVERSION_FORMAT "point-format" +# define OSSL_PKEY_PARAM_EC_PUB_X "qx" +# define OSSL_PKEY_PARAM_EC_PUB_Y "qy" +# define OSSL_PKEY_PARAM_EC_SEED "seed" +# define OSSL_PKEY_PARAM_ENCODED_PUBLIC_KEY "encoded-pub-key" +# define OSSL_PKEY_PARAM_ENGINE OSSL_ALG_PARAM_ENGINE +# define OSSL_PKEY_PARAM_FFC_COFACTOR "j" +# define OSSL_PKEY_PARAM_FFC_DIGEST OSSL_PKEY_PARAM_DIGEST +# define OSSL_PKEY_PARAM_FFC_DIGEST_PROPS OSSL_PKEY_PARAM_PROPERTIES +# define OSSL_PKEY_PARAM_FFC_G "g" +# define OSSL_PKEY_PARAM_FFC_GINDEX "gindex" +# define OSSL_PKEY_PARAM_FFC_H "hindex" +# define OSSL_PKEY_PARAM_FFC_P "p" +# define OSSL_PKEY_PARAM_FFC_PBITS "pbits" +# define OSSL_PKEY_PARAM_FFC_PCOUNTER "pcounter" +# define OSSL_PKEY_PARAM_FFC_Q "q" +# define OSSL_PKEY_PARAM_FFC_QBITS "qbits" +# define OSSL_PKEY_PARAM_FFC_SEED "seed" +# define OSSL_PKEY_PARAM_FFC_TYPE "type" +# define OSSL_PKEY_PARAM_FFC_VALIDATE_G "validate-g" +# define OSSL_PKEY_PARAM_FFC_VALIDATE_LEGACY "validate-legacy" +# define OSSL_PKEY_PARAM_FFC_VALIDATE_PQ "validate-pq" +# define OSSL_PKEY_PARAM_FIPS_APPROVED_INDICATOR OSSL_ALG_PARAM_FIPS_APPROVED_INDICATOR +# define OSSL_PKEY_PARAM_FIPS_DIGEST_CHECK "digest-check" +# define OSSL_PKEY_PARAM_FIPS_KEY_CHECK "key-check" +# define OSSL_PKEY_PARAM_FIPS_SIGN_CHECK "sign-check" +# define OSSL_PKEY_PARAM_GROUP_NAME "group" +# define OSSL_PKEY_PARAM_IMPLICIT_REJECTION "implicit-rejection" +# define OSSL_PKEY_PARAM_MANDATORY_DIGEST "mandatory-digest" +# define OSSL_PKEY_PARAM_MASKGENFUNC "mgf" +# define OSSL_PKEY_PARAM_MAX_SIZE "max-size" +# define OSSL_PKEY_PARAM_MGF1_DIGEST "mgf1-digest" +# define OSSL_PKEY_PARAM_MGF1_PROPERTIES "mgf1-properties" +# define OSSL_PKEY_PARAM_PAD_MODE "pad-mode" +# define OSSL_PKEY_PARAM_PRIV_KEY "priv" +# define OSSL_PKEY_PARAM_PROPERTIES OSSL_ALG_PARAM_PROPERTIES +# define OSSL_PKEY_PARAM_PUB_KEY "pub" +# define OSSL_PKEY_PARAM_RSA_BITS OSSL_PKEY_PARAM_BITS +# define OSSL_PKEY_PARAM_RSA_COEFFICIENT "rsa-coefficient" +# define OSSL_PKEY_PARAM_RSA_COEFFICIENT1 "rsa-coefficient1" +# define OSSL_PKEY_PARAM_RSA_COEFFICIENT2 "rsa-coefficient2" +# define OSSL_PKEY_PARAM_RSA_COEFFICIENT3 "rsa-coefficient3" +# define OSSL_PKEY_PARAM_RSA_COEFFICIENT4 "rsa-coefficient4" +# define OSSL_PKEY_PARAM_RSA_COEFFICIENT5 "rsa-coefficient5" +# define OSSL_PKEY_PARAM_RSA_COEFFICIENT6 "rsa-coefficient6" +# define OSSL_PKEY_PARAM_RSA_COEFFICIENT7 "rsa-coefficient7" +# define OSSL_PKEY_PARAM_RSA_COEFFICIENT8 "rsa-coefficient8" +# define OSSL_PKEY_PARAM_RSA_COEFFICIENT9 "rsa-coefficient9" +# define OSSL_PKEY_PARAM_RSA_D "d" +# define OSSL_PKEY_PARAM_RSA_DERIVE_FROM_PQ "rsa-derive-from-pq" +# define OSSL_PKEY_PARAM_RSA_DIGEST 
OSSL_PKEY_PARAM_DIGEST +# define OSSL_PKEY_PARAM_RSA_DIGEST_PROPS OSSL_PKEY_PARAM_PROPERTIES +# define OSSL_PKEY_PARAM_RSA_E "e" +# define OSSL_PKEY_PARAM_RSA_EXPONENT "rsa-exponent" +# define OSSL_PKEY_PARAM_RSA_EXPONENT1 "rsa-exponent1" +# define OSSL_PKEY_PARAM_RSA_EXPONENT10 "rsa-exponent10" +# define OSSL_PKEY_PARAM_RSA_EXPONENT2 "rsa-exponent2" +# define OSSL_PKEY_PARAM_RSA_EXPONENT3 "rsa-exponent3" +# define OSSL_PKEY_PARAM_RSA_EXPONENT4 "rsa-exponent4" +# define OSSL_PKEY_PARAM_RSA_EXPONENT5 "rsa-exponent5" +# define OSSL_PKEY_PARAM_RSA_EXPONENT6 "rsa-exponent6" +# define OSSL_PKEY_PARAM_RSA_EXPONENT7 "rsa-exponent7" +# define OSSL_PKEY_PARAM_RSA_EXPONENT8 "rsa-exponent8" +# define OSSL_PKEY_PARAM_RSA_EXPONENT9 "rsa-exponent9" +# define OSSL_PKEY_PARAM_RSA_FACTOR "rsa-factor" +# define OSSL_PKEY_PARAM_RSA_FACTOR1 "rsa-factor1" +# define OSSL_PKEY_PARAM_RSA_FACTOR10 "rsa-factor10" +# define OSSL_PKEY_PARAM_RSA_FACTOR2 "rsa-factor2" +# define OSSL_PKEY_PARAM_RSA_FACTOR3 "rsa-factor3" +# define OSSL_PKEY_PARAM_RSA_FACTOR4 "rsa-factor4" +# define OSSL_PKEY_PARAM_RSA_FACTOR5 "rsa-factor5" +# define OSSL_PKEY_PARAM_RSA_FACTOR6 "rsa-factor6" +# define OSSL_PKEY_PARAM_RSA_FACTOR7 "rsa-factor7" +# define OSSL_PKEY_PARAM_RSA_FACTOR8 "rsa-factor8" +# define OSSL_PKEY_PARAM_RSA_FACTOR9 "rsa-factor9" +# define OSSL_PKEY_PARAM_RSA_MASKGENFUNC OSSL_PKEY_PARAM_MASKGENFUNC +# define OSSL_PKEY_PARAM_RSA_MGF1_DIGEST OSSL_PKEY_PARAM_MGF1_DIGEST +# define OSSL_PKEY_PARAM_RSA_N "n" +# define OSSL_PKEY_PARAM_RSA_PRIMES "primes" +# define OSSL_PKEY_PARAM_RSA_PSS_SALTLEN "saltlen" +# define OSSL_PKEY_PARAM_RSA_TEST_P1 "p1" +# define OSSL_PKEY_PARAM_RSA_TEST_P2 "p2" +# define OSSL_PKEY_PARAM_RSA_TEST_Q1 "q1" +# define OSSL_PKEY_PARAM_RSA_TEST_Q2 "q2" +# define OSSL_PKEY_PARAM_RSA_TEST_XP "xp" +# define OSSL_PKEY_PARAM_RSA_TEST_XP1 "xp1" +# define OSSL_PKEY_PARAM_RSA_TEST_XP2 "xp2" +# define OSSL_PKEY_PARAM_RSA_TEST_XQ "xq" +# define OSSL_PKEY_PARAM_RSA_TEST_XQ1 "xq1" +# define OSSL_PKEY_PARAM_RSA_TEST_XQ2 "xq2" +# define OSSL_PKEY_PARAM_SECURITY_BITS "security-bits" +# define OSSL_PKEY_PARAM_USE_COFACTOR_ECDH OSSL_PKEY_PARAM_USE_COFACTOR_FLAG +# define OSSL_PKEY_PARAM_USE_COFACTOR_FLAG "use-cofactor-flag" +# define OSSL_PROV_PARAM_BUILDINFO "buildinfo" +# define OSSL_PROV_PARAM_CORE_MODULE_FILENAME "module-filename" +# define OSSL_PROV_PARAM_CORE_PROV_NAME "provider-name" +# define OSSL_PROV_PARAM_CORE_VERSION "openssl-version" +# define OSSL_PROV_PARAM_DRBG_TRUNC_DIGEST "drbg-no-trunc-md" +# define OSSL_PROV_PARAM_DSA_SIGN_DISABLED "dsa-sign-disabled" +# define OSSL_PROV_PARAM_ECDH_COFACTOR_CHECK "ecdh-cofactor-check" +# define OSSL_PROV_PARAM_HKDF_DIGEST_CHECK "hkdf-digest-check" +# define OSSL_PROV_PARAM_HKDF_KEY_CHECK "hkdf-key-check" +# define OSSL_PROV_PARAM_HMAC_KEY_CHECK "hmac-key-check" +# define OSSL_PROV_PARAM_KBKDF_KEY_CHECK "kbkdf-key-check" +# define OSSL_PROV_PARAM_KMAC_KEY_CHECK "kmac-key-check" +# define OSSL_PROV_PARAM_NAME "name" +# define OSSL_PROV_PARAM_NO_SHORT_MAC "no-short-mac" +# define OSSL_PROV_PARAM_PBKDF2_LOWER_BOUND_CHECK "pbkdf2-lower-bound-check" +# define OSSL_PROV_PARAM_RSA_PKCS15_PAD_DISABLED "rsa-pkcs15-pad-disabled" +# define OSSL_PROV_PARAM_RSA_PSS_SALTLEN_CHECK "rsa-pss-saltlen-check" +# define OSSL_PROV_PARAM_RSA_SIGN_X931_PAD_DISABLED "rsa-sign-x931-pad-disabled" +# define OSSL_PROV_PARAM_SECURITY_CHECKS "security-checks" +# define OSSL_PROV_PARAM_SELF_TEST_DESC "st-desc" +# define OSSL_PROV_PARAM_SELF_TEST_PHASE "st-phase" +# define OSSL_PROV_PARAM_SELF_TEST_TYPE 
"st-type" +# define OSSL_PROV_PARAM_SIGNATURE_DIGEST_CHECK "signature-digest-check" +# define OSSL_PROV_PARAM_SSHKDF_DIGEST_CHECK "sshkdf-digest-check" +# define OSSL_PROV_PARAM_SSHKDF_KEY_CHECK "sshkdf-key-check" +# define OSSL_PROV_PARAM_SSKDF_DIGEST_CHECK "sskdf-digest-check" +# define OSSL_PROV_PARAM_SSKDF_KEY_CHECK "sskdf-key-check" +# define OSSL_PROV_PARAM_STATUS "status" +# define OSSL_PROV_PARAM_TDES_ENCRYPT_DISABLED "tdes-encrypt-disabled" +# define OSSL_PROV_PARAM_TLS13_KDF_DIGEST_CHECK "tls13-kdf-digest-check" +# define OSSL_PROV_PARAM_TLS13_KDF_KEY_CHECK "tls13-kdf-key-check" +# define OSSL_PROV_PARAM_TLS1_PRF_DIGEST_CHECK "tls1-prf-digest-check" +# define OSSL_PROV_PARAM_TLS1_PRF_EMS_CHECK "tls1-prf-ems-check" +# define OSSL_PROV_PARAM_TLS1_PRF_KEY_CHECK "tls1-prf-key-check" +# define OSSL_PROV_PARAM_VERSION "version" +# define OSSL_PROV_PARAM_X942KDF_KEY_CHECK "x942kdf-key-check" +# define OSSL_PROV_PARAM_X963KDF_DIGEST_CHECK "x963kdf-digest-check" +# define OSSL_PROV_PARAM_X963KDF_KEY_CHECK "x963kdf-key-check" +# define OSSL_RAND_PARAM_FIPS_APPROVED_INDICATOR OSSL_ALG_PARAM_FIPS_APPROVED_INDICATOR +# define OSSL_RAND_PARAM_GENERATE "generate" +# define OSSL_RAND_PARAM_MAX_REQUEST "max_request" +# define OSSL_RAND_PARAM_STATE "state" +# define OSSL_RAND_PARAM_STRENGTH "strength" +# define OSSL_RAND_PARAM_TEST_ENTROPY "test_entropy" +# define OSSL_RAND_PARAM_TEST_NONCE "test_nonce" +# define OSSL_SIGNATURE_PARAM_ALGORITHM_ID OSSL_PKEY_PARAM_ALGORITHM_ID +# define OSSL_SIGNATURE_PARAM_ALGORITHM_ID_PARAMS OSSL_PKEY_PARAM_ALGORITHM_ID_PARAMS +# define OSSL_SIGNATURE_PARAM_CONTEXT_STRING "context-string" +# define OSSL_SIGNATURE_PARAM_DIGEST OSSL_PKEY_PARAM_DIGEST +# define OSSL_SIGNATURE_PARAM_DIGEST_SIZE OSSL_PKEY_PARAM_DIGEST_SIZE +# define OSSL_SIGNATURE_PARAM_FIPS_APPROVED_INDICATOR OSSL_ALG_PARAM_FIPS_APPROVED_INDICATOR +# define OSSL_SIGNATURE_PARAM_FIPS_DIGEST_CHECK OSSL_PKEY_PARAM_FIPS_DIGEST_CHECK +# define OSSL_SIGNATURE_PARAM_FIPS_KEY_CHECK OSSL_PKEY_PARAM_FIPS_KEY_CHECK +# define OSSL_SIGNATURE_PARAM_FIPS_RSA_PSS_SALTLEN_CHECK "rsa-pss-saltlen-check" +# define OSSL_SIGNATURE_PARAM_FIPS_SIGN_CHECK OSSL_PKEY_PARAM_FIPS_SIGN_CHECK +# define OSSL_SIGNATURE_PARAM_FIPS_SIGN_X931_PAD_CHECK "sign-x931-pad-check" +# define OSSL_SIGNATURE_PARAM_FIPS_VERIFY_MESSAGE "verify-message" +# define OSSL_SIGNATURE_PARAM_INSTANCE "instance" +# define OSSL_SIGNATURE_PARAM_KAT "kat" +# define OSSL_SIGNATURE_PARAM_MGF1_DIGEST OSSL_PKEY_PARAM_MGF1_DIGEST +# define OSSL_SIGNATURE_PARAM_MGF1_PROPERTIES OSSL_PKEY_PARAM_MGF1_PROPERTIES +# define OSSL_SIGNATURE_PARAM_NONCE_TYPE "nonce-type" +# define OSSL_SIGNATURE_PARAM_PAD_MODE OSSL_PKEY_PARAM_PAD_MODE +# define OSSL_SIGNATURE_PARAM_PROPERTIES OSSL_PKEY_PARAM_PROPERTIES +# define OSSL_SIGNATURE_PARAM_PSS_SALTLEN "saltlen" +# define OSSL_SIGNATURE_PARAM_SIGNATURE "signature" +# define OSSL_STORE_PARAM_ALIAS "alias" +# define OSSL_STORE_PARAM_DIGEST "digest" +# define OSSL_STORE_PARAM_EXPECT "expect" +# define OSSL_STORE_PARAM_FINGERPRINT "fingerprint" +# define OSSL_STORE_PARAM_INPUT_TYPE "input-type" +# define OSSL_STORE_PARAM_ISSUER "name" +# define OSSL_STORE_PARAM_PROPERTIES "properties" +# define OSSL_STORE_PARAM_SERIAL "serial" +# define OSSL_STORE_PARAM_SUBJECT "subject" + +# ifdef __cplusplus +} +# endif + +#endif diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/include/openssl/crmf.h b/CryptoPkg/Library/OpensslLib/OpensslGen/include/openssl/crmf.h index fec55fe5b086..9900edfddec6 100644 --- 
a/CryptoPkg/Library/OpensslLib/OpensslGen/include/openssl/crmf.h +++ b/CryptoPkg/Library/OpensslLib/OpensslGen/include/openssl/crmf.h @@ -2,7 +2,7 @@ * WARNING: do not edit! * Generated by Makefile from include/openssl/crmf.h.in * - * Copyright 2007-2021 The OpenSSL Project Authors. All Rights Reserved. + * Copyright 2007-2024 The OpenSSL Project Authors. All Rights Reserved. * Copyright Nokia 2007-2019 * Copyright Siemens AG 2015-2019 * @@ -43,8 +43,8 @@ extern "C" { # define OSSL_CRMF_SUBSEQUENTMESSAGE_ENCRCERT 0 # define OSSL_CRMF_SUBSEQUENTMESSAGE_CHALLENGERESP 1 - typedef struct ossl_crmf_encryptedvalue_st OSSL_CRMF_ENCRYPTEDVALUE; + DECLARE_ASN1_FUNCTIONS(OSSL_CRMF_ENCRYPTEDVALUE) typedef struct ossl_crmf_msg_st OSSL_CRMF_MSG; DECLARE_ASN1_FUNCTIONS(OSSL_CRMF_MSG) @@ -77,6 +77,36 @@ SKM_DEFINE_STACK_OF_INTERNAL(OSSL_CRMF_MSG, OSSL_CRMF_MSG, OSSL_CRMF_MSG) #define sk_OSSL_CRMF_MSG_set_cmp_func(sk, cmp) ((sk_OSSL_CRMF_MSG_compfunc)OPENSSL_sk_set_cmp_func(ossl_check_OSSL_CRMF_MSG_sk_type(sk), ossl_check_OSSL_CRMF_MSG_compfunc_type(cmp))) typedef struct ossl_crmf_attributetypeandvalue_st OSSL_CRMF_ATTRIBUTETYPEANDVALUE; +void OSSL_CRMF_ATTRIBUTETYPEANDVALUE_free(OSSL_CRMF_ATTRIBUTETYPEANDVALUE *v); +DECLARE_ASN1_DUP_FUNCTION(OSSL_CRMF_ATTRIBUTETYPEANDVALUE) +SKM_DEFINE_STACK_OF_INTERNAL(OSSL_CRMF_ATTRIBUTETYPEANDVALUE, OSSL_CRMF_ATTRIBUTETYPEANDVALUE, OSSL_CRMF_ATTRIBUTETYPEANDVALUE) +#define sk_OSSL_CRMF_ATTRIBUTETYPEANDVALUE_num(sk) OPENSSL_sk_num(ossl_check_const_OSSL_CRMF_ATTRIBUTETYPEANDVALUE_sk_type(sk)) +#define sk_OSSL_CRMF_ATTRIBUTETYPEANDVALUE_value(sk, idx) ((OSSL_CRMF_ATTRIBUTETYPEANDVALUE *)OPENSSL_sk_value(ossl_check_const_OSSL_CRMF_ATTRIBUTETYPEANDVALUE_sk_type(sk), (idx))) +#define sk_OSSL_CRMF_ATTRIBUTETYPEANDVALUE_new(cmp) ((STACK_OF(OSSL_CRMF_ATTRIBUTETYPEANDVALUE) *)OPENSSL_sk_new(ossl_check_OSSL_CRMF_ATTRIBUTETYPEANDVALUE_compfunc_type(cmp))) +#define sk_OSSL_CRMF_ATTRIBUTETYPEANDVALUE_new_null() ((STACK_OF(OSSL_CRMF_ATTRIBUTETYPEANDVALUE) *)OPENSSL_sk_new_null()) +#define sk_OSSL_CRMF_ATTRIBUTETYPEANDVALUE_new_reserve(cmp, n) ((STACK_OF(OSSL_CRMF_ATTRIBUTETYPEANDVALUE) *)OPENSSL_sk_new_reserve(ossl_check_OSSL_CRMF_ATTRIBUTETYPEANDVALUE_compfunc_type(cmp), (n))) +#define sk_OSSL_CRMF_ATTRIBUTETYPEANDVALUE_reserve(sk, n) OPENSSL_sk_reserve(ossl_check_OSSL_CRMF_ATTRIBUTETYPEANDVALUE_sk_type(sk), (n)) +#define sk_OSSL_CRMF_ATTRIBUTETYPEANDVALUE_free(sk) OPENSSL_sk_free(ossl_check_OSSL_CRMF_ATTRIBUTETYPEANDVALUE_sk_type(sk)) +#define sk_OSSL_CRMF_ATTRIBUTETYPEANDVALUE_zero(sk) OPENSSL_sk_zero(ossl_check_OSSL_CRMF_ATTRIBUTETYPEANDVALUE_sk_type(sk)) +#define sk_OSSL_CRMF_ATTRIBUTETYPEANDVALUE_delete(sk, i) ((OSSL_CRMF_ATTRIBUTETYPEANDVALUE *)OPENSSL_sk_delete(ossl_check_OSSL_CRMF_ATTRIBUTETYPEANDVALUE_sk_type(sk), (i))) +#define sk_OSSL_CRMF_ATTRIBUTETYPEANDVALUE_delete_ptr(sk, ptr) ((OSSL_CRMF_ATTRIBUTETYPEANDVALUE *)OPENSSL_sk_delete_ptr(ossl_check_OSSL_CRMF_ATTRIBUTETYPEANDVALUE_sk_type(sk), ossl_check_OSSL_CRMF_ATTRIBUTETYPEANDVALUE_type(ptr))) +#define sk_OSSL_CRMF_ATTRIBUTETYPEANDVALUE_push(sk, ptr) OPENSSL_sk_push(ossl_check_OSSL_CRMF_ATTRIBUTETYPEANDVALUE_sk_type(sk), ossl_check_OSSL_CRMF_ATTRIBUTETYPEANDVALUE_type(ptr)) +#define sk_OSSL_CRMF_ATTRIBUTETYPEANDVALUE_unshift(sk, ptr) OPENSSL_sk_unshift(ossl_check_OSSL_CRMF_ATTRIBUTETYPEANDVALUE_sk_type(sk), ossl_check_OSSL_CRMF_ATTRIBUTETYPEANDVALUE_type(ptr)) +#define sk_OSSL_CRMF_ATTRIBUTETYPEANDVALUE_pop(sk) ((OSSL_CRMF_ATTRIBUTETYPEANDVALUE *)OPENSSL_sk_pop(ossl_check_OSSL_CRMF_ATTRIBUTETYPEANDVALUE_sk_type(sk))) 
+#define sk_OSSL_CRMF_ATTRIBUTETYPEANDVALUE_shift(sk) ((OSSL_CRMF_ATTRIBUTETYPEANDVALUE *)OPENSSL_sk_shift(ossl_check_OSSL_CRMF_ATTRIBUTETYPEANDVALUE_sk_type(sk))) +#define sk_OSSL_CRMF_ATTRIBUTETYPEANDVALUE_pop_free(sk, freefunc) OPENSSL_sk_pop_free(ossl_check_OSSL_CRMF_ATTRIBUTETYPEANDVALUE_sk_type(sk),ossl_check_OSSL_CRMF_ATTRIBUTETYPEANDVALUE_freefunc_type(freefunc)) +#define sk_OSSL_CRMF_ATTRIBUTETYPEANDVALUE_insert(sk, ptr, idx) OPENSSL_sk_insert(ossl_check_OSSL_CRMF_ATTRIBUTETYPEANDVALUE_sk_type(sk), ossl_check_OSSL_CRMF_ATTRIBUTETYPEANDVALUE_type(ptr), (idx)) +#define sk_OSSL_CRMF_ATTRIBUTETYPEANDVALUE_set(sk, idx, ptr) ((OSSL_CRMF_ATTRIBUTETYPEANDVALUE *)OPENSSL_sk_set(ossl_check_OSSL_CRMF_ATTRIBUTETYPEANDVALUE_sk_type(sk), (idx), ossl_check_OSSL_CRMF_ATTRIBUTETYPEANDVALUE_type(ptr))) +#define sk_OSSL_CRMF_ATTRIBUTETYPEANDVALUE_find(sk, ptr) OPENSSL_sk_find(ossl_check_OSSL_CRMF_ATTRIBUTETYPEANDVALUE_sk_type(sk), ossl_check_OSSL_CRMF_ATTRIBUTETYPEANDVALUE_type(ptr)) +#define sk_OSSL_CRMF_ATTRIBUTETYPEANDVALUE_find_ex(sk, ptr) OPENSSL_sk_find_ex(ossl_check_OSSL_CRMF_ATTRIBUTETYPEANDVALUE_sk_type(sk), ossl_check_OSSL_CRMF_ATTRIBUTETYPEANDVALUE_type(ptr)) +#define sk_OSSL_CRMF_ATTRIBUTETYPEANDVALUE_find_all(sk, ptr, pnum) OPENSSL_sk_find_all(ossl_check_OSSL_CRMF_ATTRIBUTETYPEANDVALUE_sk_type(sk), ossl_check_OSSL_CRMF_ATTRIBUTETYPEANDVALUE_type(ptr), pnum) +#define sk_OSSL_CRMF_ATTRIBUTETYPEANDVALUE_sort(sk) OPENSSL_sk_sort(ossl_check_OSSL_CRMF_ATTRIBUTETYPEANDVALUE_sk_type(sk)) +#define sk_OSSL_CRMF_ATTRIBUTETYPEANDVALUE_is_sorted(sk) OPENSSL_sk_is_sorted(ossl_check_const_OSSL_CRMF_ATTRIBUTETYPEANDVALUE_sk_type(sk)) +#define sk_OSSL_CRMF_ATTRIBUTETYPEANDVALUE_dup(sk) ((STACK_OF(OSSL_CRMF_ATTRIBUTETYPEANDVALUE) *)OPENSSL_sk_dup(ossl_check_const_OSSL_CRMF_ATTRIBUTETYPEANDVALUE_sk_type(sk))) +#define sk_OSSL_CRMF_ATTRIBUTETYPEANDVALUE_deep_copy(sk, copyfunc, freefunc) ((STACK_OF(OSSL_CRMF_ATTRIBUTETYPEANDVALUE) *)OPENSSL_sk_deep_copy(ossl_check_const_OSSL_CRMF_ATTRIBUTETYPEANDVALUE_sk_type(sk), ossl_check_OSSL_CRMF_ATTRIBUTETYPEANDVALUE_copyfunc_type(copyfunc), ossl_check_OSSL_CRMF_ATTRIBUTETYPEANDVALUE_freefunc_type(freefunc))) +#define sk_OSSL_CRMF_ATTRIBUTETYPEANDVALUE_set_cmp_func(sk, cmp) ((sk_OSSL_CRMF_ATTRIBUTETYPEANDVALUE_compfunc)OPENSSL_sk_set_cmp_func(ossl_check_OSSL_CRMF_ATTRIBUTETYPEANDVALUE_sk_type(sk), ossl_check_OSSL_CRMF_ATTRIBUTETYPEANDVALUE_compfunc_type(cmp))) + + typedef struct ossl_crmf_pbmparameter_st OSSL_CRMF_PBMPARAMETER; DECLARE_ASN1_FUNCTIONS(OSSL_CRMF_PBMPARAMETER) typedef struct ossl_crmf_poposigningkey_st OSSL_CRMF_POPOSIGNINGKEY; @@ -118,6 +148,7 @@ typedef struct ossl_crmf_singlepubinfo_st OSSL_CRMF_SINGLEPUBINFO; DECLARE_ASN1_FUNCTIONS(OSSL_CRMF_SINGLEPUBINFO) typedef struct ossl_crmf_certtemplate_st OSSL_CRMF_CERTTEMPLATE; DECLARE_ASN1_FUNCTIONS(OSSL_CRMF_CERTTEMPLATE) +DECLARE_ASN1_DUP_FUNCTION(OSSL_CRMF_CERTTEMPLATE) typedef STACK_OF(OSSL_CRMF_MSG) OSSL_CRMF_MSGS; DECLARE_ASN1_FUNCTIONS(OSSL_CRMF_MSGS) @@ -198,12 +229,14 @@ int OSSL_CRMF_MSGS_verify_popo(const OSSL_CRMF_MSGS *reqs, int rid, int acceptRAVerified, OSSL_LIB_CTX *libctx, const char *propq); OSSL_CRMF_CERTTEMPLATE *OSSL_CRMF_MSG_get0_tmpl(const OSSL_CRMF_MSG *crm); -const ASN1_INTEGER -*OSSL_CRMF_CERTTEMPLATE_get0_serialNumber(const OSSL_CRMF_CERTTEMPLATE *tmpl); +X509_PUBKEY +*OSSL_CRMF_CERTTEMPLATE_get0_publicKey(const OSSL_CRMF_CERTTEMPLATE *tmpl); const X509_NAME *OSSL_CRMF_CERTTEMPLATE_get0_subject(const OSSL_CRMF_CERTTEMPLATE *tmpl); const X509_NAME 
*OSSL_CRMF_CERTTEMPLATE_get0_issuer(const OSSL_CRMF_CERTTEMPLATE *tmpl); +const ASN1_INTEGER +*OSSL_CRMF_CERTTEMPLATE_get0_serialNumber(const OSSL_CRMF_CERTTEMPLATE *tmpl); X509_EXTENSIONS *OSSL_CRMF_CERTTEMPLATE_get0_extensions(const OSSL_CRMF_CERTTEMPLATE *tmpl); const X509_NAME diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/include/openssl/crypto.h b/CryptoPkg/Library/OpensslLib/OpensslGen/include/openssl/crypto.h index 568a4c4df1a1..ae7e30a26ee1 100644 --- a/CryptoPkg/Library/OpensslLib/OpensslGen/include/openssl/crypto.h +++ b/CryptoPkg/Library/OpensslLib/OpensslGen/include/openssl/crypto.h @@ -2,7 +2,7 @@ * WARNING: do not edit! * Generated by Makefile from include/openssl/crypto.h.in * - * Copyright 1995-2022 The OpenSSL Project Authors. All Rights Reserved. + * Copyright 1995-2024 The OpenSSL Project Authors. All Rights Reserved. * Copyright (c) 2002, Oracle and/or its affiliates. All rights reserved * * Licensed under the Apache License 2.0 (the "License"). You may not use @@ -85,9 +85,15 @@ int CRYPTO_THREAD_unlock(CRYPTO_RWLOCK *lock); void CRYPTO_THREAD_lock_free(CRYPTO_RWLOCK *lock); int CRYPTO_atomic_add(int *val, int amount, int *ret, CRYPTO_RWLOCK *lock); +int CRYPTO_atomic_add64(uint64_t *val, uint64_t op, uint64_t *ret, + CRYPTO_RWLOCK *lock); +int CRYPTO_atomic_and(uint64_t *val, uint64_t op, uint64_t *ret, + CRYPTO_RWLOCK *lock); int CRYPTO_atomic_or(uint64_t *val, uint64_t op, uint64_t *ret, CRYPTO_RWLOCK *lock); int CRYPTO_atomic_load(uint64_t *val, uint64_t *ret, CRYPTO_RWLOCK *lock); +int CRYPTO_atomic_load_int(int *val, int *ret, CRYPTO_RWLOCK *lock); +int CRYPTO_atomic_store(uint64_t *dst, uint64_t val, CRYPTO_RWLOCK *lock); /* No longer needed, so this is a no-op */ #define OPENSSL_malloc_init() while(0) continue @@ -96,6 +102,9 @@ int CRYPTO_atomic_load(uint64_t *val, uint64_t *ret, CRYPTO_RWLOCK *lock); CRYPTO_malloc(num, OPENSSL_FILE, OPENSSL_LINE) # define OPENSSL_zalloc(num) \ CRYPTO_zalloc(num, OPENSSL_FILE, OPENSSL_LINE) +# define OPENSSL_aligned_alloc(num, alignment, freeptr) \ + CRYPTO_aligned_alloc(num, alignment, freeptr, \ + OPENSSL_FILE, OPENSSL_LINE) # define OPENSSL_realloc(addr, num) \ CRYPTO_realloc(addr, num, OPENSSL_FILE, OPENSSL_LINE) # define OPENSSL_clear_realloc(addr, old_num, num) \ @@ -124,6 +133,7 @@ int CRYPTO_atomic_load(uint64_t *val, uint64_t *ret, CRYPTO_RWLOCK *lock); size_t OPENSSL_strlcpy(char *dst, const char *src, size_t siz); size_t OPENSSL_strlcat(char *dst, const char *src, size_t siz); size_t OPENSSL_strnlen(const char *str, size_t maxlen); +int OPENSSL_strtoul(const char *str, char **endptr, int base, unsigned long *num); int OPENSSL_buf2hexstr_ex(char *str, size_t str_n, size_t *strlength, const unsigned char *buf, size_t buflen, const char sep); @@ -160,6 +170,7 @@ const char *OpenSSL_version(int type); # define OPENSSL_FULL_VERSION_STRING 7 # define OPENSSL_MODULES_DIR 8 # define OPENSSL_CPU_INFO 9 +# define OPENSSL_WINCTX 10 const char *OPENSSL_info(int type); /* @@ -174,6 +185,7 @@ const char *OPENSSL_info(int type); # define OPENSSL_INFO_LIST_SEPARATOR 1006 # define OPENSSL_INFO_SEED_SOURCE 1007 # define OPENSSL_INFO_CPU_SETTINGS 1008 +# define OPENSSL_INFO_WINDOWS_CONTEXT 1009 int OPENSSL_issetugid(void); @@ -341,11 +353,14 @@ void CRYPTO_get_mem_functions(CRYPTO_malloc_fn *malloc_fn, CRYPTO_realloc_fn *realloc_fn, CRYPTO_free_fn *free_fn); -void *CRYPTO_malloc(size_t num, const char *file, int line); -void *CRYPTO_zalloc(size_t num, const char *file, int line); -void *CRYPTO_memdup(const void *str, size_t 
siz, const char *file, int line); -char *CRYPTO_strdup(const char *str, const char *file, int line); -char *CRYPTO_strndup(const char *str, size_t s, const char *file, int line); +OSSL_CRYPTO_ALLOC void *CRYPTO_malloc(size_t num, const char *file, int line); +OSSL_CRYPTO_ALLOC void *CRYPTO_zalloc(size_t num, const char *file, int line); +OSSL_CRYPTO_ALLOC void *CRYPTO_aligned_alloc(size_t num, size_t align, + void **freeptr, const char *file, + int line); +OSSL_CRYPTO_ALLOC void *CRYPTO_memdup(const void *str, size_t siz, const char *file, int line); +OSSL_CRYPTO_ALLOC char *CRYPTO_strdup(const char *str, const char *file, int line); +OSSL_CRYPTO_ALLOC char *CRYPTO_strndup(const char *str, size_t s, const char *file, int line); void CRYPTO_free(void *ptr, const char *file, int line); void CRYPTO_clear_free(void *ptr, size_t num, const char *file, int line); void *CRYPTO_realloc(void *addr, size_t num, const char *file, int line); @@ -354,8 +369,8 @@ void *CRYPTO_clear_realloc(void *addr, size_t old_num, size_t num, int CRYPTO_secure_malloc_init(size_t sz, size_t minsize); int CRYPTO_secure_malloc_done(void); -void *CRYPTO_secure_malloc(size_t num, const char *file, int line); -void *CRYPTO_secure_zalloc(size_t num, const char *file, int line); +OSSL_CRYPTO_ALLOC void *CRYPTO_secure_malloc(size_t num, const char *file, int line); +OSSL_CRYPTO_ALLOC void *CRYPTO_secure_zalloc(size_t num, const char *file, int line); void CRYPTO_secure_free(void *ptr, const char *file, int line); void CRYPTO_secure_clear_free(void *ptr, size_t num, const char *file, int line); @@ -551,6 +566,13 @@ int OSSL_LIB_CTX_load_config(OSSL_LIB_CTX *ctx, const char *config_file); void OSSL_LIB_CTX_free(OSSL_LIB_CTX *); OSSL_LIB_CTX *OSSL_LIB_CTX_get0_global_default(void); OSSL_LIB_CTX *OSSL_LIB_CTX_set0_default(OSSL_LIB_CTX *libctx); +int OSSL_LIB_CTX_get_conf_diagnostics(OSSL_LIB_CTX *ctx); +void OSSL_LIB_CTX_set_conf_diagnostics(OSSL_LIB_CTX *ctx, int value); + +void OSSL_sleep(uint64_t millis); + + +void *OSSL_LIB_CTX_get_data(OSSL_LIB_CTX *ctx, int index); # ifdef __cplusplus } diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/include/openssl/err.h b/CryptoPkg/Library/OpensslLib/OpensslGen/include/openssl/err.h index b1289ff45e68..8f74f4805b62 100644 --- a/CryptoPkg/Library/OpensslLib/OpensslGen/include/openssl/err.h +++ b/CryptoPkg/Library/OpensslLib/OpensslGen/include/openssl/err.h @@ -1,5 +1,5 @@ /* - * Copyright 1995-2022 The OpenSSL Project Authors. All Rights Reserved. + * Copyright 1995-2023 The OpenSSL Project Authors. All Rights Reserved. * * Licensed under the Apache License 2.0 (the "License"). You may not use * this file except in compliance with the License. 
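The crypto.h hunk above declares several new atomic helpers (CRYPTO_atomic_add64, CRYPTO_atomic_and, CRYPTO_atomic_load_int, CRYPTO_atomic_store) plus OSSL_sleep. A minimal sketch of how they compose, assuming a hosted C environment with stdio rather than UEFI firmware (illustrative only, not part of the patch):

    #include <stdio.h>
    #include <stdint.h>
    #include <openssl/crypto.h>

    int main(void)
    {
        /* Fallback lock, used only on platforms without native atomics. */
        CRYPTO_RWLOCK *lock = CRYPTO_THREAD_lock_new();
        uint64_t counter = 0, result = 0;

        if (lock == NULL)
            return 1;

        CRYPTO_atomic_add64(&counter, 5, &result, lock);  /* counter = 5, result = 5 */
        CRYPTO_atomic_or(&counter, 0x10, &result, lock);  /* counter = 0x15 */
        CRYPTO_atomic_and(&counter, 0x0f, &result, lock); /* counter = 0x05 */
        CRYPTO_atomic_store(&counter, 42, lock);          /* counter = 42 */
        CRYPTO_atomic_load(&counter, &result, lock);      /* result = 42 */

        printf("counter = %llu\n", (unsigned long long)result);

        OSSL_sleep(10);                                   /* newly exported: sleep ~10 ms */
        CRYPTO_THREAD_lock_free(lock);
        return 0;
    }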
You can obtain a copy @@ -372,7 +372,7 @@ typedef struct ERR_string_data_st { } ERR_STRING_DATA; DEFINE_LHASH_OF_INTERNAL(ERR_STRING_DATA); -#define lh_ERR_STRING_DATA_new(hfn, cmp) ((LHASH_OF(ERR_STRING_DATA) *)OPENSSL_LH_new(ossl_check_ERR_STRING_DATA_lh_hashfunc_type(hfn), ossl_check_ERR_STRING_DATA_lh_compfunc_type(cmp))) +#define lh_ERR_STRING_DATA_new(hfn, cmp) ((LHASH_OF(ERR_STRING_DATA) *)OPENSSL_LH_set_thunks(OPENSSL_LH_new(ossl_check_ERR_STRING_DATA_lh_hashfunc_type(hfn), ossl_check_ERR_STRING_DATA_lh_compfunc_type(cmp)), lh_ERR_STRING_DATA_hash_thunk, lh_ERR_STRING_DATA_comp_thunk, lh_ERR_STRING_DATA_doall_thunk, lh_ERR_STRING_DATA_doall_arg_thunk)) #define lh_ERR_STRING_DATA_free(lh) OPENSSL_LH_free(ossl_check_ERR_STRING_DATA_lh_type(lh)) #define lh_ERR_STRING_DATA_flush(lh) OPENSSL_LH_flush(ossl_check_ERR_STRING_DATA_lh_type(lh)) #define lh_ERR_STRING_DATA_insert(lh, ptr) ((ERR_STRING_DATA *)OPENSSL_LH_insert(ossl_check_ERR_STRING_DATA_lh_type(lh), ossl_check_ERR_STRING_DATA_lh_plain_type(ptr))) @@ -496,6 +496,14 @@ int ERR_get_next_error_library(void); int ERR_set_mark(void); int ERR_pop_to_mark(void); int ERR_clear_last_mark(void); +int ERR_count_to_mark(void); +int ERR_pop(void); + +ERR_STATE *OSSL_ERR_STATE_new(void); +void OSSL_ERR_STATE_save(ERR_STATE *es); +void OSSL_ERR_STATE_save_to_mark(ERR_STATE *es); +void OSSL_ERR_STATE_restore(const ERR_STATE *es); +void OSSL_ERR_STATE_free(ERR_STATE *es); #ifdef __cplusplus } diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/include/openssl/fipskey.h b/CryptoPkg/Library/OpensslLib/OpensslGen/include/openssl/fipskey.h index d4057561e5a0..80ce3fc46284 100644 --- a/CryptoPkg/Library/OpensslLib/OpensslGen/include/openssl/fipskey.h +++ b/CryptoPkg/Library/OpensslLib/OpensslGen/include/openssl/fipskey.h @@ -2,7 +2,7 @@ * WARNING: do not edit! * Generated by Makefile from include/openssl/fipskey.h.in * - * Copyright 2020-2021 The OpenSSL Project Authors. All Rights Reserved. + * Copyright 2020-2024 The OpenSSL Project Authors. All Rights Reserved. * * Licensed under the Apache License 2.0 (the "License"). You may not use * this file except in compliance with the License. You can obtain a copy @@ -29,6 +29,11 @@ extern "C" { */ #define FIPS_KEY_STRING "f4556650ac31d35461610bac4ed81b1a181b2d8a43ea2854cbae22ca74560813" +/* + * The FIPS provider vendor name, as a string. + */ +#define FIPS_VENDOR "OpenSSL non-compliant FIPS Provider" + # ifdef __cplusplus } # endif diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/include/openssl/lhash.h b/CryptoPkg/Library/OpensslLib/OpensslGen/include/openssl/lhash.h index cbb26384c1f1..93044eec7091 100644 --- a/CryptoPkg/Library/OpensslLib/OpensslGen/include/openssl/lhash.h +++ b/CryptoPkg/Library/OpensslLib/OpensslGen/include/openssl/lhash.h @@ -1,5 +1,5 @@ /* - * Copyright 1995-2021 The OpenSSL Project Authors. All Rights Reserved. + * Copyright 1995-2024 The OpenSSL Project Authors. All Rights Reserved. * * Licensed under the Apache License 2.0 (the "License"). You may not use * this file except in compliance with the License. 
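Similarly, the err.h hunk above adds ERR_count_to_mark(), ERR_pop() and the OSSL_ERR_STATE save/restore API. A small sketch of the mark-counting pattern, again illustrative only (the deliberately failing fetch of a non-existent digest is just a convenient way to push an error):

    #include <stdio.h>
    #include <openssl/err.h>
    #include <openssl/evp.h>

    static int count_errors_from(void (*op)(void))
    {
        int n;

        ERR_set_mark();          /* remember the current top of the error stack */
        op();                    /* run something that may report errors */
        n = ERR_count_to_mark(); /* newly declared: errors recorded above the mark */
        ERR_pop_to_mark();       /* drop them, leaving older entries untouched */
        return n;
    }

    static void failing_op(void)
    {
        EVP_MD *md = EVP_MD_fetch(NULL, "no-such-digest", NULL); /* fails, pushes error(s) */

        EVP_MD_free(md);         /* md is NULL here; freeing NULL is a no-op */
    }

    int main(void)
    {
        printf("errors recorded: %d\n", count_errors_from(failing_op));
        return 0;
    }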
You can obtain a copy @@ -24,6 +24,9 @@ # include # include +# ifndef OPENSSL_NO_STDIO +# include +# endif #ifdef __cplusplus extern "C" { @@ -31,9 +34,13 @@ extern "C" { typedef struct lhash_node_st OPENSSL_LH_NODE; typedef int (*OPENSSL_LH_COMPFUNC) (const void *, const void *); +typedef int (*OPENSSL_LH_COMPFUNCTHUNK) (const void *, const void *, OPENSSL_LH_COMPFUNC cfn); typedef unsigned long (*OPENSSL_LH_HASHFUNC) (const void *); +typedef unsigned long (*OPENSSL_LH_HASHFUNCTHUNK) (const void *, OPENSSL_LH_HASHFUNC hfn); typedef void (*OPENSSL_LH_DOALL_FUNC) (void *); +typedef void (*OPENSSL_LH_DOALL_FUNC_THUNK) (void *, OPENSSL_LH_DOALL_FUNC doall); typedef void (*OPENSSL_LH_DOALL_FUNCARG) (void *, void *); +typedef void (*OPENSSL_LH_DOALL_FUNCARG_THUNK) (void *, void *, OPENSSL_LH_DOALL_FUNCARG doall); typedef struct lhash_st OPENSSL_LHASH; /* @@ -79,26 +86,40 @@ typedef struct lhash_st OPENSSL_LHASH; int OPENSSL_LH_error(OPENSSL_LHASH *lh); OPENSSL_LHASH *OPENSSL_LH_new(OPENSSL_LH_HASHFUNC h, OPENSSL_LH_COMPFUNC c); +OPENSSL_LHASH *OPENSSL_LH_set_thunks(OPENSSL_LHASH *lh, + OPENSSL_LH_HASHFUNCTHUNK hw, + OPENSSL_LH_COMPFUNCTHUNK cw, + OPENSSL_LH_DOALL_FUNC_THUNK daw, + OPENSSL_LH_DOALL_FUNCARG_THUNK daaw); void OPENSSL_LH_free(OPENSSL_LHASH *lh); void OPENSSL_LH_flush(OPENSSL_LHASH *lh); void *OPENSSL_LH_insert(OPENSSL_LHASH *lh, void *data); void *OPENSSL_LH_delete(OPENSSL_LHASH *lh, const void *data); void *OPENSSL_LH_retrieve(OPENSSL_LHASH *lh, const void *data); void OPENSSL_LH_doall(OPENSSL_LHASH *lh, OPENSSL_LH_DOALL_FUNC func); -void OPENSSL_LH_doall_arg(OPENSSL_LHASH *lh, OPENSSL_LH_DOALL_FUNCARG func, void *arg); +void OPENSSL_LH_doall_arg(OPENSSL_LHASH *lh, + OPENSSL_LH_DOALL_FUNCARG func, void *arg); +void OPENSSL_LH_doall_arg_thunk(OPENSSL_LHASH *lh, + OPENSSL_LH_DOALL_FUNCARG_THUNK daaw, + OPENSSL_LH_DOALL_FUNCARG fn, void *arg); + unsigned long OPENSSL_LH_strhash(const char *c); unsigned long OPENSSL_LH_num_items(const OPENSSL_LHASH *lh); unsigned long OPENSSL_LH_get_down_load(const OPENSSL_LHASH *lh); void OPENSSL_LH_set_down_load(OPENSSL_LHASH *lh, unsigned long down_load); # ifndef OPENSSL_NO_STDIO -void OPENSSL_LH_stats(const OPENSSL_LHASH *lh, FILE *fp); -void OPENSSL_LH_node_stats(const OPENSSL_LHASH *lh, FILE *fp); -void OPENSSL_LH_node_usage_stats(const OPENSSL_LHASH *lh, FILE *fp); +# ifndef OPENSSL_NO_DEPRECATED_3_1 +OSSL_DEPRECATEDIN_3_1 void OPENSSL_LH_stats(const OPENSSL_LHASH *lh, FILE *fp); +OSSL_DEPRECATEDIN_3_1 void OPENSSL_LH_node_stats(const OPENSSL_LHASH *lh, FILE *fp); +OSSL_DEPRECATEDIN_3_1 void OPENSSL_LH_node_usage_stats(const OPENSSL_LHASH *lh, FILE *fp); +# endif +# endif +# ifndef OPENSSL_NO_DEPRECATED_3_1 +OSSL_DEPRECATEDIN_3_1 void OPENSSL_LH_stats_bio(const OPENSSL_LHASH *lh, BIO *out); +OSSL_DEPRECATEDIN_3_1 void OPENSSL_LH_node_stats_bio(const OPENSSL_LHASH *lh, BIO *out); +OSSL_DEPRECATEDIN_3_1 void OPENSSL_LH_node_usage_stats_bio(const OPENSSL_LHASH *lh, BIO *out); # endif -void OPENSSL_LH_stats_bio(const OPENSSL_LHASH *lh, BIO *out); -void OPENSSL_LH_node_stats_bio(const OPENSSL_LHASH *lh, BIO *out); -void OPENSSL_LH_node_usage_stats_bio(const OPENSSL_LHASH *lh, BIO *out); # ifndef OPENSSL_NO_DEPRECATED_1_1_0 # define _LHASH OPENSSL_LHASH @@ -129,110 +150,190 @@ void OPENSSL_LH_node_usage_stats_bio(const OPENSSL_LHASH *lh, BIO *out); /* Helper macro for internal use */ # define DEFINE_LHASH_OF_INTERNAL(type) \ - LHASH_OF(type) { union lh_##type##_dummy { void* d1; unsigned long d2; int d3; } dummy; }; \ + LHASH_OF(type) { \ + union 
lh_##type##_dummy { void* d1; unsigned long d2; int d3; } dummy; \ + }; \ typedef int (*lh_##type##_compfunc)(const type *a, const type *b); \ typedef unsigned long (*lh_##type##_hashfunc)(const type *a); \ typedef void (*lh_##type##_doallfunc)(type *a); \ - static ossl_unused ossl_inline type *ossl_check_##type##_lh_plain_type(type *ptr) \ + static ossl_inline unsigned long lh_##type##_hash_thunk(const void *data, OPENSSL_LH_HASHFUNC hfn) \ + { \ + unsigned long (*hfn_conv)(const type *) = (unsigned long (*)(const type *))hfn; \ + return hfn_conv((const type *)data); \ + } \ + static ossl_inline int lh_##type##_comp_thunk(const void *da, const void *db, OPENSSL_LH_COMPFUNC cfn) \ + { \ + int (*cfn_conv)(const type *, const type *) = (int (*)(const type *, const type *))cfn; \ + return cfn_conv((const type *)da, (const type *)db); \ + } \ + static ossl_inline void lh_##type##_doall_thunk(void *node, OPENSSL_LH_DOALL_FUNC doall) \ + { \ + void (*doall_conv)(type *) = (void (*)(type *))doall; \ + doall_conv((type *)node); \ + } \ + static ossl_inline void lh_##type##_doall_arg_thunk(void *node, void *arg, OPENSSL_LH_DOALL_FUNCARG doall) \ + { \ + void (*doall_conv)(type *, void *) = (void (*)(type *, void *))doall; \ + doall_conv((type *)node, arg); \ + } \ + static ossl_unused ossl_inline type *\ + ossl_check_##type##_lh_plain_type(type *ptr) \ { \ return ptr; \ } \ - static ossl_unused ossl_inline const type *ossl_check_const_##type##_lh_plain_type(const type *ptr) \ + static ossl_unused ossl_inline const type * \ + ossl_check_const_##type##_lh_plain_type(const type *ptr) \ { \ return ptr; \ } \ - static ossl_unused ossl_inline const OPENSSL_LHASH *ossl_check_const_##type##_lh_type(const LHASH_OF(type) *lh) \ + static ossl_unused ossl_inline const OPENSSL_LHASH * \ + ossl_check_const_##type##_lh_type(const LHASH_OF(type) *lh) \ { \ return (const OPENSSL_LHASH *)lh; \ } \ - static ossl_unused ossl_inline OPENSSL_LHASH *ossl_check_##type##_lh_type(LHASH_OF(type) *lh) \ + static ossl_unused ossl_inline OPENSSL_LHASH * \ + ossl_check_##type##_lh_type(LHASH_OF(type) *lh) \ { \ return (OPENSSL_LHASH *)lh; \ } \ - static ossl_unused ossl_inline OPENSSL_LH_COMPFUNC ossl_check_##type##_lh_compfunc_type(lh_##type##_compfunc cmp) \ + static ossl_unused ossl_inline OPENSSL_LH_COMPFUNC \ + ossl_check_##type##_lh_compfunc_type(lh_##type##_compfunc cmp) \ { \ return (OPENSSL_LH_COMPFUNC)cmp; \ } \ - static ossl_unused ossl_inline OPENSSL_LH_HASHFUNC ossl_check_##type##_lh_hashfunc_type(lh_##type##_hashfunc hfn) \ + static ossl_unused ossl_inline OPENSSL_LH_HASHFUNC \ + ossl_check_##type##_lh_hashfunc_type(lh_##type##_hashfunc hfn) \ { \ return (OPENSSL_LH_HASHFUNC)hfn; \ } \ - static ossl_unused ossl_inline OPENSSL_LH_DOALL_FUNC ossl_check_##type##_lh_doallfunc_type(lh_##type##_doallfunc dfn) \ + static ossl_unused ossl_inline OPENSSL_LH_DOALL_FUNC \ + ossl_check_##type##_lh_doallfunc_type(lh_##type##_doallfunc dfn) \ { \ return (OPENSSL_LH_DOALL_FUNC)dfn; \ } \ LHASH_OF(type) -# define DEFINE_LHASH_OF(type) \ - LHASH_OF(type) { union lh_##type##_dummy { void* d1; unsigned long d2; int d3; } dummy; }; \ - static ossl_unused ossl_inline LHASH_OF(type) *lh_##type##_new(unsigned long (*hfn)(const type *), \ - int (*cfn)(const type *, const type *)) \ +# ifndef OPENSSL_NO_DEPRECATED_3_1 +# define DEFINE_LHASH_OF_DEPRECATED(type) \ + static ossl_unused ossl_inline void \ + lh_##type##_node_stats_bio(const LHASH_OF(type) *lh, BIO *out) \ { \ - return (LHASH_OF(type) *) \ - 
OPENSSL_LH_new((OPENSSL_LH_HASHFUNC)hfn, (OPENSSL_LH_COMPFUNC)cfn); \ + OPENSSL_LH_node_stats_bio((const OPENSSL_LHASH *)lh, out); \ } \ - static ossl_unused ossl_inline void lh_##type##_free(LHASH_OF(type) *lh) \ + static ossl_unused ossl_inline void \ + lh_##type##_node_usage_stats_bio(const LHASH_OF(type) *lh, BIO *out) \ + { \ + OPENSSL_LH_node_usage_stats_bio((const OPENSSL_LHASH *)lh, out); \ + } \ + static ossl_unused ossl_inline void \ + lh_##type##_stats_bio(const LHASH_OF(type) *lh, BIO *out) \ + { \ + OPENSSL_LH_stats_bio((const OPENSSL_LHASH *)lh, out); \ + } +# else +# define DEFINE_LHASH_OF_DEPRECATED(type) +# endif + +# define DEFINE_LHASH_OF_EX(type) \ + LHASH_OF(type) { \ + union lh_##type##_dummy { void* d1; unsigned long d2; int d3; } dummy; \ + }; \ + static unsigned long \ + lh_##type##_hfn_thunk(const void *data, OPENSSL_LH_HASHFUNC hfn) \ + { \ + unsigned long (*hfn_conv)(const type *) = (unsigned long (*)(const type *))hfn; \ + return hfn_conv((const type *)data); \ + } \ + static int lh_##type##_cfn_thunk(const void *da, const void *db, OPENSSL_LH_COMPFUNC cfn) \ + { \ + int (*cfn_conv)(const type *, const type *) = (int (*)(const type *, const type *))cfn; \ + return cfn_conv((const type *)da, (const type *)db); \ + } \ + static ossl_unused ossl_inline void \ + lh_##type##_free(LHASH_OF(type) *lh) \ { \ OPENSSL_LH_free((OPENSSL_LHASH *)lh); \ } \ - static ossl_unused ossl_inline void lh_##type##_flush(LHASH_OF(type) *lh) \ + static ossl_unused ossl_inline void \ + lh_##type##_flush(LHASH_OF(type) *lh) \ { \ OPENSSL_LH_flush((OPENSSL_LHASH *)lh); \ } \ - static ossl_unused ossl_inline type *lh_##type##_insert(LHASH_OF(type) *lh, type *d) \ + static ossl_unused ossl_inline type * \ + lh_##type##_insert(LHASH_OF(type) *lh, type *d) \ { \ return (type *)OPENSSL_LH_insert((OPENSSL_LHASH *)lh, d); \ } \ - static ossl_unused ossl_inline type *lh_##type##_delete(LHASH_OF(type) *lh, const type *d) \ + static ossl_unused ossl_inline type * \ + lh_##type##_delete(LHASH_OF(type) *lh, const type *d) \ { \ return (type *)OPENSSL_LH_delete((OPENSSL_LHASH *)lh, d); \ } \ - static ossl_unused ossl_inline type *lh_##type##_retrieve(LHASH_OF(type) *lh, const type *d) \ + static ossl_unused ossl_inline type * \ + lh_##type##_retrieve(LHASH_OF(type) *lh, const type *d) \ { \ return (type *)OPENSSL_LH_retrieve((OPENSSL_LHASH *)lh, d); \ } \ - static ossl_unused ossl_inline int lh_##type##_error(LHASH_OF(type) *lh) \ + static ossl_unused ossl_inline int \ + lh_##type##_error(LHASH_OF(type) *lh) \ { \ return OPENSSL_LH_error((OPENSSL_LHASH *)lh); \ } \ - static ossl_unused ossl_inline unsigned long lh_##type##_num_items(LHASH_OF(type) *lh) \ + static ossl_unused ossl_inline unsigned long \ + lh_##type##_num_items(LHASH_OF(type) *lh) \ { \ return OPENSSL_LH_num_items((OPENSSL_LHASH *)lh); \ } \ - static ossl_unused ossl_inline void lh_##type##_node_stats_bio(const LHASH_OF(type) *lh, BIO *out) \ + static ossl_unused ossl_inline unsigned long \ + lh_##type##_get_down_load(LHASH_OF(type) *lh) \ { \ - OPENSSL_LH_node_stats_bio((const OPENSSL_LHASH *)lh, out); \ + return OPENSSL_LH_get_down_load((OPENSSL_LHASH *)lh); \ } \ - static ossl_unused ossl_inline void lh_##type##_node_usage_stats_bio(const LHASH_OF(type) *lh, BIO *out) \ + static ossl_unused ossl_inline void \ + lh_##type##_set_down_load(LHASH_OF(type) *lh, unsigned long dl) \ { \ - OPENSSL_LH_node_usage_stats_bio((const OPENSSL_LHASH *)lh, out); \ + OPENSSL_LH_set_down_load((OPENSSL_LHASH *)lh, dl); \ } \ - static ossl_unused 
ossl_inline void lh_##type##_stats_bio(const LHASH_OF(type) *lh, BIO *out) \ + static ossl_unused ossl_inline void \ + lh_##type##_doall_thunk(void *node, OPENSSL_LH_DOALL_FUNC doall) \ { \ - OPENSSL_LH_stats_bio((const OPENSSL_LHASH *)lh, out); \ + void (*doall_conv)(type *) = (void (*)(type *))doall; \ + doall_conv((type *)node); \ } \ - static ossl_unused ossl_inline unsigned long lh_##type##_get_down_load(LHASH_OF(type) *lh) \ + static ossl_unused ossl_inline void \ + lh_##type##_doall_arg_thunk(void *node, void *arg, OPENSSL_LH_DOALL_FUNCARG doall) \ { \ - return OPENSSL_LH_get_down_load((OPENSSL_LHASH *)lh); \ + void (*doall_conv)(type *, void *) = (void (*)(type *, void *))doall; \ + doall_conv((type *)node, arg); \ } \ - static ossl_unused ossl_inline void lh_##type##_set_down_load(LHASH_OF(type) *lh, unsigned long dl) \ + static ossl_unused ossl_inline void \ + lh_##type##_doall(LHASH_OF(type) *lh, void (*doall)(type *)) \ { \ - OPENSSL_LH_set_down_load((OPENSSL_LHASH *)lh, dl); \ + OPENSSL_LH_doall((OPENSSL_LHASH *)lh, (OPENSSL_LH_DOALL_FUNC)doall); \ } \ - static ossl_unused ossl_inline void lh_##type##_doall(LHASH_OF(type) *lh, \ - void (*doall)(type *)) \ + static ossl_unused ossl_inline LHASH_OF(type) * \ + lh_##type##_new(unsigned long (*hfn)(const type *), \ + int (*cfn)(const type *, const type *)) \ { \ - OPENSSL_LH_doall((OPENSSL_LHASH *)lh, (OPENSSL_LH_DOALL_FUNC)doall); \ + return (LHASH_OF(type) *)OPENSSL_LH_set_thunks(OPENSSL_LH_new((OPENSSL_LH_HASHFUNC)hfn, (OPENSSL_LH_COMPFUNC)cfn), \ + lh_##type##_hfn_thunk, lh_##type##_cfn_thunk, \ + lh_##type##_doall_thunk, \ + lh_##type##_doall_arg_thunk); \ } \ - static ossl_unused ossl_inline void lh_##type##_doall_arg(LHASH_OF(type) *lh, \ - void (*doallarg)(type *, void *), \ - void *arg) \ + static ossl_unused ossl_inline void \ + lh_##type##_doall_arg(LHASH_OF(type) *lh, \ + void (*doallarg)(type *, void *), void *arg) \ { \ OPENSSL_LH_doall_arg((OPENSSL_LHASH *)lh, \ (OPENSSL_LH_DOALL_FUNCARG)doallarg, arg); \ } \ LHASH_OF(type) +# define DEFINE_LHASH_OF(type) \ + DEFINE_LHASH_OF_EX(type); \ + DEFINE_LHASH_OF_DEPRECATED(type) \ + LHASH_OF(type) + #define IMPLEMENT_LHASH_DOALL_ARG_CONST(type, argtype) \ int_implement_lhash_doall(type, argtype, const type) @@ -240,17 +341,26 @@ void OPENSSL_LH_node_usage_stats_bio(const OPENSSL_LHASH *lh, BIO *out); int_implement_lhash_doall(type, argtype, type) #define int_implement_lhash_doall(type, argtype, cbargtype) \ + static ossl_unused ossl_inline void \ + lh_##type##_doall_##argtype##_thunk(void *node, void *arg, OPENSSL_LH_DOALL_FUNCARG fn) \ + { \ + void (*fn_conv)(cbargtype *, argtype *) = (void (*)(cbargtype *, argtype *))fn; \ + fn_conv((cbargtype *)node, (argtype *)arg); \ + } \ static ossl_unused ossl_inline void \ lh_##type##_doall_##argtype(LHASH_OF(type) *lh, \ void (*fn)(cbargtype *, argtype *), \ argtype *arg) \ { \ - OPENSSL_LH_doall_arg((OPENSSL_LHASH *)lh, (OPENSSL_LH_DOALL_FUNCARG)fn, (void *)arg); \ + OPENSSL_LH_doall_arg_thunk((OPENSSL_LHASH *)lh, \ + lh_##type##_doall_##argtype##_thunk, \ + (OPENSSL_LH_DOALL_FUNCARG)fn, \ + (void *)arg); \ } \ LHASH_OF(type) DEFINE_LHASH_OF_INTERNAL(OPENSSL_STRING); -#define lh_OPENSSL_STRING_new(hfn, cmp) ((LHASH_OF(OPENSSL_STRING) *)OPENSSL_LH_new(ossl_check_OPENSSL_STRING_lh_hashfunc_type(hfn), ossl_check_OPENSSL_STRING_lh_compfunc_type(cmp))) +#define lh_OPENSSL_STRING_new(hfn, cmp) ((LHASH_OF(OPENSSL_STRING) *)OPENSSL_LH_set_thunks(OPENSSL_LH_new(ossl_check_OPENSSL_STRING_lh_hashfunc_type(hfn), 
ossl_check_OPENSSL_STRING_lh_compfunc_type(cmp)), lh_OPENSSL_STRING_hash_thunk, lh_OPENSSL_STRING_comp_thunk, lh_OPENSSL_STRING_doall_thunk, lh_OPENSSL_STRING_doall_arg_thunk)) #define lh_OPENSSL_STRING_free(lh) OPENSSL_LH_free(ossl_check_OPENSSL_STRING_lh_type(lh)) #define lh_OPENSSL_STRING_flush(lh) OPENSSL_LH_flush(ossl_check_OPENSSL_STRING_lh_type(lh)) #define lh_OPENSSL_STRING_insert(lh, ptr) ((OPENSSL_STRING *)OPENSSL_LH_insert(ossl_check_OPENSSL_STRING_lh_type(lh), ossl_check_OPENSSL_STRING_lh_plain_type(ptr))) @@ -265,7 +375,7 @@ DEFINE_LHASH_OF_INTERNAL(OPENSSL_STRING); #define lh_OPENSSL_STRING_set_down_load(lh, dl) OPENSSL_LH_set_down_load(ossl_check_OPENSSL_STRING_lh_type(lh), dl) #define lh_OPENSSL_STRING_doall(lh, dfn) OPENSSL_LH_doall(ossl_check_OPENSSL_STRING_lh_type(lh), ossl_check_OPENSSL_STRING_lh_doallfunc_type(dfn)) DEFINE_LHASH_OF_INTERNAL(OPENSSL_CSTRING); -#define lh_OPENSSL_CSTRING_new(hfn, cmp) ((LHASH_OF(OPENSSL_CSTRING) *)OPENSSL_LH_new(ossl_check_OPENSSL_CSTRING_lh_hashfunc_type(hfn), ossl_check_OPENSSL_CSTRING_lh_compfunc_type(cmp))) +#define lh_OPENSSL_CSTRING_new(hfn, cmp) ((LHASH_OF(OPENSSL_CSTRING) *)OPENSSL_LH_set_thunks(OPENSSL_LH_new(ossl_check_OPENSSL_CSTRING_lh_hashfunc_type(hfn), ossl_check_OPENSSL_CSTRING_lh_compfunc_type(cmp)), lh_OPENSSL_CSTRING_hash_thunk, lh_OPENSSL_CSTRING_comp_thunk, lh_OPENSSL_CSTRING_doall_thunk, lh_OPENSSL_CSTRING_doall_arg_thunk)) #define lh_OPENSSL_CSTRING_free(lh) OPENSSL_LH_free(ossl_check_OPENSSL_CSTRING_lh_type(lh)) #define lh_OPENSSL_CSTRING_flush(lh) OPENSSL_LH_flush(ossl_check_OPENSSL_CSTRING_lh_type(lh)) #define lh_OPENSSL_CSTRING_insert(lh, ptr) ((OPENSSL_CSTRING *)OPENSSL_LH_insert(ossl_check_OPENSSL_CSTRING_lh_type(lh), ossl_check_OPENSSL_CSTRING_lh_plain_type(ptr))) diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/include/openssl/opensslv.h b/CryptoPkg/Library/OpensslLib/OpensslGen/include/openssl/opensslv.h index efbc858cdd51..a096ca90bacb 100644 --- a/CryptoPkg/Library/OpensslLib/OpensslGen/include/openssl/opensslv.h +++ b/CryptoPkg/Library/OpensslLib/OpensslGen/include/openssl/opensslv.h @@ -28,8 +28,8 @@ extern "C" { * These macros express version number MAJOR.MINOR.PATCH exactly */ # define OPENSSL_VERSION_MAJOR 3 -# define OPENSSL_VERSION_MINOR 0 -# define OPENSSL_VERSION_PATCH 15 +# define OPENSSL_VERSION_MINOR 4 +# define OPENSSL_VERSION_PATCH 0 /* * Additional version information @@ -74,21 +74,21 @@ extern "C" { * longer variant with OPENSSL_VERSION_PRE_RELEASE_STR and * OPENSSL_VERSION_BUILD_METADATA_STR appended. */ -# define OPENSSL_VERSION_STR "3.0.15" -# define OPENSSL_FULL_VERSION_STR "3.0.15" +# define OPENSSL_VERSION_STR "3.4.0" +# define OPENSSL_FULL_VERSION_STR "3.4.0" /* * SECTION 3: ADDITIONAL METADATA * * These strings are defined separately to allow them to be parsable. */ -# define OPENSSL_RELEASE_DATE "3 Sep 2024" +# define OPENSSL_RELEASE_DATE "22 Oct 2024" /* * SECTION 4: BACKWARD COMPATIBILITY */ -# define OPENSSL_VERSION_TEXT "OpenSSL 3.0.15 3 Sep 2024" +# define OPENSSL_VERSION_TEXT "OpenSSL 3.4.0 22 Oct 2024" /* Synthesize OPENSSL_VERSION_NUMBER with the layout 0xMNN00PPSL */ # ifdef OPENSSL_VERSION_PRE_RELEASE diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/include/openssl/pkcs12.h b/CryptoPkg/Library/OpensslLib/OpensslGen/include/openssl/pkcs12.h index a11ca80ef75d..40e26b451529 100644 --- a/CryptoPkg/Library/OpensslLib/OpensslGen/include/openssl/pkcs12.h +++ b/CryptoPkg/Library/OpensslLib/OpensslGen/include/openssl/pkcs12.h @@ -2,7 +2,7 @@ * WARNING: do not edit! 
* Generated by Makefile from include/openssl/pkcs12.h.in * - * Copyright 1999-2021 The OpenSSL Project Authors. All Rights Reserved. + * Copyright 1999-2024 The OpenSSL Project Authors. All Rights Reserved. * * Licensed under the Apache License 2.0 (the "License"). You may not use * this file except in compliance with the License. You can obtain a copy @@ -25,6 +25,9 @@ # include # include # include +# ifndef OPENSSL_NO_STDIO +# include +# endif #ifdef __cplusplus extern "C" { @@ -41,6 +44,7 @@ extern "C" { # define PKCS12_MAC_KEY_LENGTH 20 +/* The macro is expected to be used only internally. Kept for backwards compatibility. */ # define PKCS12_SALT_LEN 8 /* It's not clear if these are actually needed... */ @@ -130,7 +134,9 @@ int PKCS12_SAFEBAG_get_bag_nid(const PKCS12_SAFEBAG *bag); const ASN1_TYPE *PKCS12_SAFEBAG_get0_bag_obj(const PKCS12_SAFEBAG *bag); const ASN1_OBJECT *PKCS12_SAFEBAG_get0_bag_type(const PKCS12_SAFEBAG *bag); +X509 *PKCS12_SAFEBAG_get1_cert_ex(const PKCS12_SAFEBAG *bag, OSSL_LIB_CTX *libctx, const char *propq); X509 *PKCS12_SAFEBAG_get1_cert(const PKCS12_SAFEBAG *bag); +X509_CRL *PKCS12_SAFEBAG_get1_crl_ex(const PKCS12_SAFEBAG *bag, OSSL_LIB_CTX *libctx, const char *propq); X509_CRL *PKCS12_SAFEBAG_get1_crl(const PKCS12_SAFEBAG *bag); const STACK_OF(PKCS12_SAFEBAG) * PKCS12_SAFEBAG_get0_safes(const PKCS12_SAFEBAG *bag); @@ -218,6 +224,7 @@ ASN1_TYPE *PKCS12_get_attr_gen(const STACK_OF(X509_ATTRIBUTE) *attrs, char *PKCS12_get_friendlyname(PKCS12_SAFEBAG *bag); const STACK_OF(X509_ATTRIBUTE) * PKCS12_SAFEBAG_get0_attrs(const PKCS12_SAFEBAG *bag); +void PKCS12_SAFEBAG_set0_attrs(PKCS12_SAFEBAG *bag, STACK_OF(X509_ATTRIBUTE) *attrs); unsigned char *PKCS12_pbe_crypt(const X509_ALGOR *algor, const char *pass, int passlen, const unsigned char *in, int inlen, @@ -285,6 +292,9 @@ int PKCS12_verify_mac(PKCS12 *p12, const char *pass, int passlen); int PKCS12_set_mac(PKCS12 *p12, const char *pass, int passlen, unsigned char *salt, int saltlen, int iter, const EVP_MD *md_type); +int PKCS12_set_pbmac1_pbkdf2(PKCS12 *p12, const char *pass, int passlen, + unsigned char *salt, int saltlen, int iter, + const EVP_MD *md_type, const char *prf_md_name); int PKCS12_setup_mac(PKCS12 *p12, int iter, unsigned char *salt, int saltlen, const EVP_MD *md_type); unsigned char *OPENSSL_asc2uni(const char *asc, int asclen, @@ -305,6 +315,7 @@ DECLARE_ASN1_ITEM(PKCS12_AUTHSAFES) void PKCS12_PBE_add(void); int PKCS12_parse(PKCS12 *p12, const char *pass, EVP_PKEY **pkey, X509 **cert, STACK_OF(X509) **ca); +typedef int PKCS12_create_cb(PKCS12_SAFEBAG *bag, void *cbarg); PKCS12 *PKCS12_create(const char *pass, const char *name, EVP_PKEY *pkey, X509 *cert, STACK_OF(X509) *ca, int nid_key, int nid_cert, int iter, int mac_iter, int keytype); @@ -312,6 +323,11 @@ PKCS12 *PKCS12_create_ex(const char *pass, const char *name, EVP_PKEY *pkey, X509 *cert, STACK_OF(X509) *ca, int nid_key, int nid_cert, int iter, int mac_iter, int keytype, OSSL_LIB_CTX *ctx, const char *propq); +PKCS12 *PKCS12_create_ex2(const char *pass, const char *name, EVP_PKEY *pkey, + X509 *cert, STACK_OF(X509) *ca, int nid_key, int nid_cert, + int iter, int mac_iter, int keytype, + OSSL_LIB_CTX *ctx, const char *propq, + PKCS12_create_cb *cb, void *cbarg); PKCS12_SAFEBAG *PKCS12_add_cert(STACK_OF(PKCS12_SAFEBAG) **pbags, X509 *cert); PKCS12_SAFEBAG *PKCS12_add_key(STACK_OF(PKCS12_SAFEBAG) **pbags, diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/include/openssl/pkcs7.h b/CryptoPkg/Library/OpensslLib/OpensslGen/include/openssl/pkcs7.h 
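Before moving on to the pkcs7.h changes: the pkcs12.h hunks above introduce PKCS12_create_ex2() and its PKCS12_create_cb callback type. A hedged sketch of the callback-based creation path follows; the helper name, the zero/NULL defaults and the exact callback return-value contract are assumptions on top of what the header declares, so consult the PKCS12_create_ex2 documentation before relying on them:

    #include <openssl/pkcs12.h>

    /* Callback invoked for each PKCS12_SAFEBAG as the structure is built. */
    static int count_bags_cb(PKCS12_SAFEBAG *bag, void *cbarg)
    {
        int *count = cbarg;

        (void)bag;       /* a real callback could inspect or modify the bag */
        (*count)++;
        return 1;        /* 1 = continue; see PKCS12_create_ex2() docs for other values */
    }

    /* 'key' and 'cert' are assumed to be a matching key pair loaded elsewhere;
     * zeros/NULLs request the library defaults for NIDs, iteration counts,
     * key type, library context and property query string. */
    PKCS12 *make_p12(EVP_PKEY *key, X509 *cert, const char *pass)
    {
        int bags = 0;

        return PKCS12_create_ex2(pass, "friendly name", key, cert, NULL,
                                 0, 0, 0, 0, 0, NULL, NULL,
                                 count_bags_cb, &bags);
    }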
index 497fcf379226..91239d1be4ac 100644 --- a/CryptoPkg/Library/OpensslLib/OpensslGen/include/openssl/pkcs7.h +++ b/CryptoPkg/Library/OpensslLib/OpensslGen/include/openssl/pkcs7.h @@ -2,7 +2,7 @@ * WARNING: do not edit! * Generated by Makefile from include/openssl/pkcs7.h.in * - * Copyright 1995-2023 The OpenSSL Project Authors. All Rights Reserved. + * Copyright 1995-2021 The OpenSSL Project Authors. All Rights Reserved. * * Licensed under the Apache License 2.0 (the "License"). You may not use * this file except in compliance with the License. You can obtain a copy @@ -28,6 +28,9 @@ # include # include # include +# ifndef OPENSSL_NO_STDIO +# include +# endif #ifdef __cplusplus extern "C" { @@ -131,8 +134,8 @@ SKM_DEFINE_STACK_OF_INTERNAL(PKCS7_RECIP_INFO, PKCS7_RECIP_INFO, PKCS7_RECIP_INF typedef struct pkcs7_signed_st { ASN1_INTEGER *version; /* version 1 */ STACK_OF(X509_ALGOR) *md_algs; /* md used */ - STACK_OF(X509) *cert; /* [ 0 ] */ - STACK_OF(X509_CRL) *crl; /* [ 1 ] */ + STACK_OF(X509) *cert; /* [ 0 ] */ /* name should be 'certificates' */ + STACK_OF(X509_CRL) *crl; /* [ 1 ] */ /* name should be 'crls' */ STACK_OF(PKCS7_SIGNER_INFO) *signer_info; struct pkcs7_st *contents; } PKCS7_SIGNED; @@ -158,8 +161,8 @@ typedef struct pkcs7_enveloped_st { typedef struct pkcs7_signedandenveloped_st { ASN1_INTEGER *version; /* version 1 */ STACK_OF(X509_ALGOR) *md_algs; /* md used */ - STACK_OF(X509) *cert; /* [ 0 ] */ - STACK_OF(X509_CRL) *crl; /* [ 1 ] */ + STACK_OF(X509) *cert; /* [ 0 ] */ /* name should be 'certificates' */ + STACK_OF(X509_CRL) *crl; /* [ 1 ] */ /* name should be 'crls' */ STACK_OF(PKCS7_SIGNER_INFO) *signer_info; PKCS7_ENC_CONTENT *enc_data; STACK_OF(PKCS7_RECIP_INFO) *recipientinfo; @@ -200,7 +203,7 @@ typedef struct pkcs7_st { /* NID_pkcs7_data */ ASN1_OCTET_STRING *data; /* NID_pkcs7_signed */ - PKCS7_SIGNED *sign; + PKCS7_SIGNED *sign; /* field name 'signed' would clash with C keyword */ /* NID_pkcs7_enveloped */ PKCS7_ENVELOPE *enveloped; /* NID_pkcs7_signedAndEnveloped */ @@ -341,13 +344,13 @@ int PKCS7_SIGNER_INFO_set(PKCS7_SIGNER_INFO *p7i, X509 *x509, EVP_PKEY *pkey, const EVP_MD *dgst); int PKCS7_SIGNER_INFO_sign(PKCS7_SIGNER_INFO *si); int PKCS7_add_signer(PKCS7 *p7, PKCS7_SIGNER_INFO *p7i); -int PKCS7_add_certificate(PKCS7 *p7, X509 *x509); -int PKCS7_add_crl(PKCS7 *p7, X509_CRL *x509); +int PKCS7_add_certificate(PKCS7 *p7, X509 *cert); +int PKCS7_add_crl(PKCS7 *p7, X509_CRL *crl); int PKCS7_content_new(PKCS7 *p7, int nid); int PKCS7_dataVerify(X509_STORE *cert_store, X509_STORE_CTX *ctx, BIO *bio, PKCS7 *p7, PKCS7_SIGNER_INFO *si); int PKCS7_signatureVerify(BIO *bio, PKCS7 *p7, PKCS7_SIGNER_INFO *si, - X509 *x509); + X509 *signer); BIO *PKCS7_dataInit(PKCS7 *p7, BIO *bio); int PKCS7_dataFinal(PKCS7 *p7, BIO *bio); diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/include/openssl/ssl.h b/CryptoPkg/Library/OpensslLib/OpensslGen/include/openssl/ssl.h index fb5939dbc8c0..9741f3a18a58 100644 --- a/CryptoPkg/Library/OpensslLib/OpensslGen/include/openssl/ssl.h +++ b/CryptoPkg/Library/OpensslLib/OpensslGen/include/openssl/ssl.h @@ -2,7 +2,7 @@ * WARNING: do not edit! * Generated by Makefile from include/openssl/ssl.h.in * - * Copyright 1995-2022 The OpenSSL Project Authors. All Rights Reserved. + * Copyright 1995-2024 The OpenSSL Project Authors. All Rights Reserved. * Copyright (c) 2002, Oracle and/or its affiliates. All rights reserved * Copyright 2005 Nokia. All rights reserved. 
* @@ -24,6 +24,7 @@ # endif # include +# include # include # include # include @@ -42,6 +43,9 @@ # include # include # include +# ifndef OPENSSL_NO_STDIO +# include +# endif #ifdef __cplusplus extern "C" { @@ -231,10 +235,8 @@ typedef struct ssl_cipher_st SSL_CIPHER; typedef struct ssl_session_st SSL_SESSION; typedef struct tls_sigalgs_st TLS_SIGALGS; typedef struct ssl_conf_ctx_st SSL_CONF_CTX; -typedef struct ssl_comp_st SSL_COMP; STACK_OF(SSL_CIPHER); -STACK_OF(SSL_COMP); /* SRTP protection profiles for use with the use_srtp extension (RFC 5764)*/ typedef struct srtp_protection_profile_st { @@ -278,28 +280,31 @@ typedef int (*tls_session_secret_cb_fn)(SSL *s, void *secret, int *secret_len, /* Extension context codes */ /* This extension is only allowed in TLS */ -#define SSL_EXT_TLS_ONLY 0x0001 +#define SSL_EXT_TLS_ONLY 0x00001 /* This extension is only allowed in DTLS */ -#define SSL_EXT_DTLS_ONLY 0x0002 +#define SSL_EXT_DTLS_ONLY 0x00002 /* Some extensions may be allowed in DTLS but we don't implement them for it */ -#define SSL_EXT_TLS_IMPLEMENTATION_ONLY 0x0004 +#define SSL_EXT_TLS_IMPLEMENTATION_ONLY 0x00004 /* Most extensions are not defined for SSLv3 but EXT_TYPE_renegotiate is */ -#define SSL_EXT_SSL3_ALLOWED 0x0008 +#define SSL_EXT_SSL3_ALLOWED 0x00008 /* Extension is only defined for TLS1.2 and below */ -#define SSL_EXT_TLS1_2_AND_BELOW_ONLY 0x0010 +#define SSL_EXT_TLS1_2_AND_BELOW_ONLY 0x00010 /* Extension is only defined for TLS1.3 and above */ -#define SSL_EXT_TLS1_3_ONLY 0x0020 +#define SSL_EXT_TLS1_3_ONLY 0x00020 /* Ignore this extension during parsing if we are resuming */ -#define SSL_EXT_IGNORE_ON_RESUMPTION 0x0040 -#define SSL_EXT_CLIENT_HELLO 0x0080 +#define SSL_EXT_IGNORE_ON_RESUMPTION 0x00040 +#define SSL_EXT_CLIENT_HELLO 0x00080 /* Really means TLS1.2 or below */ -#define SSL_EXT_TLS1_2_SERVER_HELLO 0x0100 -#define SSL_EXT_TLS1_3_SERVER_HELLO 0x0200 -#define SSL_EXT_TLS1_3_ENCRYPTED_EXTENSIONS 0x0400 -#define SSL_EXT_TLS1_3_HELLO_RETRY_REQUEST 0x0800 -#define SSL_EXT_TLS1_3_CERTIFICATE 0x1000 -#define SSL_EXT_TLS1_3_NEW_SESSION_TICKET 0x2000 -#define SSL_EXT_TLS1_3_CERTIFICATE_REQUEST 0x4000 +#define SSL_EXT_TLS1_2_SERVER_HELLO 0x00100 +#define SSL_EXT_TLS1_3_SERVER_HELLO 0x00200 +#define SSL_EXT_TLS1_3_ENCRYPTED_EXTENSIONS 0x00400 +#define SSL_EXT_TLS1_3_HELLO_RETRY_REQUEST 0x00800 +#define SSL_EXT_TLS1_3_CERTIFICATE 0x01000 +#define SSL_EXT_TLS1_3_NEW_SESSION_TICKET 0x02000 +#define SSL_EXT_TLS1_3_CERTIFICATE_REQUEST 0x04000 +#define SSL_EXT_TLS1_3_CERTIFICATE_COMPRESSION 0x08000 +/* When sending a raw public key in a certificate message */ +#define SSL_EXT_TLS1_3_RAW_PUBLIC_KEY 0x10000 /* Typedefs for handling custom extensions */ @@ -430,6 +435,19 @@ typedef int (*SSL_async_callback_fn)(SSL *s, void *arg); * interoperability with CryptoPro CSP 3.x */ # define SSL_OP_CRYPTOPRO_TLSEXT_BUG SSL_OP_BIT(31) +/* + * Disable RFC8879 certificate compression + * SSL_OP_NO_TX_CERTIFICATE_COMPRESSION: don't send compressed certificates, + * and ignore the extension when received. + * SSL_OP_NO_RX_CERTIFICATE_COMPRESSION: don't send the extension, and + * subsequently indicating that receiving is not supported + */ +# define SSL_OP_NO_TX_CERTIFICATE_COMPRESSION SSL_OP_BIT(32) +# define SSL_OP_NO_RX_CERTIFICATE_COMPRESSION SSL_OP_BIT(33) + /* Enable KTLS TX zerocopy on Linux */ +# define SSL_OP_ENABLE_KTLS_TX_ZEROCOPY_SENDFILE SSL_OP_BIT(34) + +#define SSL_OP_PREFER_NO_DHE_KEX SSL_OP_BIT(35) /* * Option "collections." 
@@ -574,6 +592,8 @@ typedef int (*SSL_async_callback_fn)(SSL *s, void *arg); # define CERT_PKEY_CERT_TYPE 0x400 /* Cert chain suitable to Suite B */ # define CERT_PKEY_SUITEB 0x800 +/* Cert pkey valid for raw public key use */ +# define CERT_PKEY_RPK 0x1000 # define SSL_CONF_FLAG_CMDLINE 0x1 # define SSL_CONF_FLAG_FILE 0x2 @@ -965,6 +985,7 @@ uint32_t SSL_get_recv_max_early_data(const SSL *s); # include /* This is mostly sslv3 with a few tweaks */ # include /* Datagram TLS */ # include /* Support for the use_srtp extension */ +# include #ifdef __cplusplus extern "C" { @@ -1000,32 +1021,6 @@ SKM_DEFINE_STACK_OF_INTERNAL(SSL_CIPHER, const SSL_CIPHER, SSL_CIPHER) #define sk_SSL_CIPHER_dup(sk) ((STACK_OF(SSL_CIPHER) *)OPENSSL_sk_dup(ossl_check_const_SSL_CIPHER_sk_type(sk))) #define sk_SSL_CIPHER_deep_copy(sk, copyfunc, freefunc) ((STACK_OF(SSL_CIPHER) *)OPENSSL_sk_deep_copy(ossl_check_const_SSL_CIPHER_sk_type(sk), ossl_check_SSL_CIPHER_copyfunc_type(copyfunc), ossl_check_SSL_CIPHER_freefunc_type(freefunc))) #define sk_SSL_CIPHER_set_cmp_func(sk, cmp) ((sk_SSL_CIPHER_compfunc)OPENSSL_sk_set_cmp_func(ossl_check_SSL_CIPHER_sk_type(sk), ossl_check_SSL_CIPHER_compfunc_type(cmp))) -SKM_DEFINE_STACK_OF_INTERNAL(SSL_COMP, SSL_COMP, SSL_COMP) -#define sk_SSL_COMP_num(sk) OPENSSL_sk_num(ossl_check_const_SSL_COMP_sk_type(sk)) -#define sk_SSL_COMP_value(sk, idx) ((SSL_COMP *)OPENSSL_sk_value(ossl_check_const_SSL_COMP_sk_type(sk), (idx))) -#define sk_SSL_COMP_new(cmp) ((STACK_OF(SSL_COMP) *)OPENSSL_sk_new(ossl_check_SSL_COMP_compfunc_type(cmp))) -#define sk_SSL_COMP_new_null() ((STACK_OF(SSL_COMP) *)OPENSSL_sk_new_null()) -#define sk_SSL_COMP_new_reserve(cmp, n) ((STACK_OF(SSL_COMP) *)OPENSSL_sk_new_reserve(ossl_check_SSL_COMP_compfunc_type(cmp), (n))) -#define sk_SSL_COMP_reserve(sk, n) OPENSSL_sk_reserve(ossl_check_SSL_COMP_sk_type(sk), (n)) -#define sk_SSL_COMP_free(sk) OPENSSL_sk_free(ossl_check_SSL_COMP_sk_type(sk)) -#define sk_SSL_COMP_zero(sk) OPENSSL_sk_zero(ossl_check_SSL_COMP_sk_type(sk)) -#define sk_SSL_COMP_delete(sk, i) ((SSL_COMP *)OPENSSL_sk_delete(ossl_check_SSL_COMP_sk_type(sk), (i))) -#define sk_SSL_COMP_delete_ptr(sk, ptr) ((SSL_COMP *)OPENSSL_sk_delete_ptr(ossl_check_SSL_COMP_sk_type(sk), ossl_check_SSL_COMP_type(ptr))) -#define sk_SSL_COMP_push(sk, ptr) OPENSSL_sk_push(ossl_check_SSL_COMP_sk_type(sk), ossl_check_SSL_COMP_type(ptr)) -#define sk_SSL_COMP_unshift(sk, ptr) OPENSSL_sk_unshift(ossl_check_SSL_COMP_sk_type(sk), ossl_check_SSL_COMP_type(ptr)) -#define sk_SSL_COMP_pop(sk) ((SSL_COMP *)OPENSSL_sk_pop(ossl_check_SSL_COMP_sk_type(sk))) -#define sk_SSL_COMP_shift(sk) ((SSL_COMP *)OPENSSL_sk_shift(ossl_check_SSL_COMP_sk_type(sk))) -#define sk_SSL_COMP_pop_free(sk, freefunc) OPENSSL_sk_pop_free(ossl_check_SSL_COMP_sk_type(sk),ossl_check_SSL_COMP_freefunc_type(freefunc)) -#define sk_SSL_COMP_insert(sk, ptr, idx) OPENSSL_sk_insert(ossl_check_SSL_COMP_sk_type(sk), ossl_check_SSL_COMP_type(ptr), (idx)) -#define sk_SSL_COMP_set(sk, idx, ptr) ((SSL_COMP *)OPENSSL_sk_set(ossl_check_SSL_COMP_sk_type(sk), (idx), ossl_check_SSL_COMP_type(ptr))) -#define sk_SSL_COMP_find(sk, ptr) OPENSSL_sk_find(ossl_check_SSL_COMP_sk_type(sk), ossl_check_SSL_COMP_type(ptr)) -#define sk_SSL_COMP_find_ex(sk, ptr) OPENSSL_sk_find_ex(ossl_check_SSL_COMP_sk_type(sk), ossl_check_SSL_COMP_type(ptr)) -#define sk_SSL_COMP_find_all(sk, ptr, pnum) OPENSSL_sk_find_all(ossl_check_SSL_COMP_sk_type(sk), ossl_check_SSL_COMP_type(ptr), pnum) -#define sk_SSL_COMP_sort(sk) OPENSSL_sk_sort(ossl_check_SSL_COMP_sk_type(sk)) -#define 
sk_SSL_COMP_is_sorted(sk) OPENSSL_sk_is_sorted(ossl_check_const_SSL_COMP_sk_type(sk)) -#define sk_SSL_COMP_dup(sk) ((STACK_OF(SSL_COMP) *)OPENSSL_sk_dup(ossl_check_const_SSL_COMP_sk_type(sk))) -#define sk_SSL_COMP_deep_copy(sk, copyfunc, freefunc) ((STACK_OF(SSL_COMP) *)OPENSSL_sk_deep_copy(ossl_check_const_SSL_COMP_sk_type(sk), ossl_check_SSL_COMP_copyfunc_type(copyfunc), ossl_check_SSL_COMP_freefunc_type(freefunc))) -#define sk_SSL_COMP_set_cmp_func(sk, cmp) ((sk_SSL_COMP_compfunc)OPENSSL_sk_set_cmp_func(ossl_check_SSL_COMP_sk_type(sk), ossl_check_SSL_COMP_compfunc_type(cmp))) /* compatibility */ @@ -1066,6 +1061,7 @@ typedef enum { DTLS_ST_CR_HELLO_VERIFY_REQUEST, TLS_ST_CR_SRVR_HELLO, TLS_ST_CR_CERT, + TLS_ST_CR_COMP_CERT, TLS_ST_CR_CERT_STATUS, TLS_ST_CR_KEY_EXCH, TLS_ST_CR_CERT_REQ, @@ -1075,6 +1071,7 @@ typedef enum { TLS_ST_CR_FINISHED, TLS_ST_CW_CLNT_HELLO, TLS_ST_CW_CERT, + TLS_ST_CW_COMP_CERT, TLS_ST_CW_KEY_EXCH, TLS_ST_CW_CERT_VRFY, TLS_ST_CW_CHANGE, @@ -1085,10 +1082,12 @@ typedef enum { DTLS_ST_SW_HELLO_VERIFY_REQUEST, TLS_ST_SW_SRVR_HELLO, TLS_ST_SW_CERT, + TLS_ST_SW_COMP_CERT, TLS_ST_SW_KEY_EXCH, TLS_ST_SW_CERT_REQ, TLS_ST_SW_SRVR_DONE, TLS_ST_SR_CERT, + TLS_ST_SR_COMP_CERT, TLS_ST_SR_KEY_EXCH, TLS_ST_SR_CERT_VRFY, TLS_ST_SR_NEXT_PROTO, @@ -1380,6 +1379,7 @@ DECLARE_PEM_rw(SSL_SESSION, SSL_SESSION) # define SSL_CTRL_GET_SIGNATURE_NID 132 # define SSL_CTRL_GET_TMP_KEY 133 # define SSL_CTRL_GET_NEGOTIATED_GROUP 134 +# define SSL_CTRL_GET_IANA_GROUPS 135 # define SSL_CTRL_SET_RETRY_VERIFY 136 # define SSL_CTRL_GET_VERIFY_CERT_STORE 137 # define SSL_CTRL_GET_CHAIN_CERT_STORE 138 @@ -1485,6 +1485,8 @@ DECLARE_PEM_rw(SSL_SESSION, SSL_SESSION) # define SSL_get1_groups(s, glist) \ SSL_ctrl(s,SSL_CTRL_GET_GROUPS,0,(int*)(glist)) +# define SSL_get0_iana_groups(s, plst) \ + SSL_ctrl(s,SSL_CTRL_GET_IANA_GROUPS,0,(uint16_t **)(plst)) # define SSL_CTX_set1_groups(ctx, glist, glistlen) \ SSL_CTX_ctrl(ctx,SSL_CTRL_SET_GROUPS,glistlen,(int *)(glist)) # define SSL_CTX_set1_groups_list(ctx, s) \ @@ -1549,6 +1551,7 @@ DECLARE_PEM_rw(SSL_SESSION, SSL_SESSION) # define SSL_get_max_proto_version(s) \ SSL_ctrl(s, SSL_CTRL_GET_MAX_PROTO_VERSION, 0, NULL) +const char *SSL_get0_group_name(SSL *s); const char *SSL_group_to_name(SSL *s, int id); /* Backwards compatibility, original 1.1.0 names */ @@ -1613,7 +1616,11 @@ void SSL_CTX_set1_cert_store(SSL_CTX *, X509_STORE *); __owur int SSL_want(const SSL *s); __owur int SSL_clear(SSL *s); +#ifndef OPENSSL_NO_DEPRECATED_3_4 +OSSL_DEPRECATEDIN_3_4_FOR("not Y2038-safe, replace with SSL_CTX_flush_sessions_ex()") void SSL_CTX_flush_sessions(SSL_CTX *ctx, long tm); +#endif +void SSL_CTX_flush_sessions_ex(SSL_CTX *ctx, time_t tm); __owur const SSL_CIPHER *SSL_get_current_cipher(const SSL *s); __owur const SSL_CIPHER *SSL_get_pending_cipher(const SSL *s); @@ -1725,13 +1732,21 @@ __owur const char *SSL_state_string(const SSL *s); __owur const char *SSL_rstate_string(const SSL *s); __owur const char *SSL_state_string_long(const SSL *s); __owur const char *SSL_rstate_string_long(const SSL *s); + +#ifndef OPENSSL_NO_DEPRECATED_3_4 +OSSL_DEPRECATEDIN_3_4_FOR("not Y2038-safe, replace with SSL_SESSION_get_time_ex()") __owur long SSL_SESSION_get_time(const SSL_SESSION *s); +OSSL_DEPRECATEDIN_3_4_FOR("not Y2038-safe, replace with SSL_SESSION_set_time_ex()") __owur long SSL_SESSION_set_time(SSL_SESSION *s, long t); +#endif __owur long SSL_SESSION_get_timeout(const SSL_SESSION *s); __owur long SSL_SESSION_set_timeout(SSL_SESSION *s, long t); __owur int 
SSL_SESSION_get_protocol_version(const SSL_SESSION *s); __owur int SSL_SESSION_set_protocol_version(SSL_SESSION *s, int version); +__owur time_t SSL_SESSION_get_time_ex(const SSL_SESSION *s); +__owur time_t SSL_SESSION_set_time_ex(SSL_SESSION *s, time_t t); + __owur const char *SSL_SESSION_get0_hostname(const SSL_SESSION *s); __owur int SSL_SESSION_set1_hostname(SSL_SESSION *s, const char *hostname); void SSL_SESSION_get0_alpn_selected(const SSL_SESSION *s, @@ -1783,6 +1798,9 @@ __owur int SSL_has_matching_session_id(const SSL *s, unsigned int id_len); SSL_SESSION *d2i_SSL_SESSION(SSL_SESSION **a, const unsigned char **pp, long length); +SSL_SESSION *d2i_SSL_SESSION_ex(SSL_SESSION **a, const unsigned char **pp, + long length, OSSL_LIB_CTX *libctx, + const char *propq); # ifdef OPENSSL_X509_H __owur X509 *SSL_get0_peer_certificate(const SSL *s); @@ -1840,6 +1858,8 @@ __owur int SSL_CTX_set_session_id_context(SSL_CTX *ctx, SSL *SSL_new(SSL_CTX *ctx); int SSL_up_ref(SSL *s); int SSL_is_dtls(const SSL *s); +int SSL_is_tls(const SSL *s); +int SSL_is_quic(const SSL *s); __owur int SSL_set_session_id_context(SSL *ssl, const unsigned char *sid_ctx, unsigned int sid_ctx_len); @@ -1932,6 +1952,8 @@ size_t SSL_client_hello_get0_ciphers(SSL *s, const unsigned char **out); size_t SSL_client_hello_get0_compression_methods(SSL *s, const unsigned char **out); int SSL_client_hello_get1_extensions_present(SSL *s, int **out, size_t *outlen); +int SSL_client_hello_get_extension_order(SSL *s, uint16_t *exts, + size_t *num_exts); int SSL_client_hello_get0_ext(SSL *s, unsigned int type, const unsigned char **out, size_t *outlen); @@ -1978,6 +2000,12 @@ long SSL_callback_ctrl(SSL *, int, void (*)(void)); long SSL_CTX_ctrl(SSL_CTX *ctx, int cmd, long larg, void *parg); long SSL_CTX_callback_ctrl(SSL_CTX *, int, void (*)(void)); +# define SSL_WRITE_FLAG_CONCLUDE (1U << 0) + +__owur int SSL_write_ex2(SSL *s, const void *buf, size_t num, + uint64_t flags, + size_t *written); + # define SSL_EARLY_DATA_NOT_SENT 0 # define SSL_EARLY_DATA_REJECTED 1 # define SSL_EARLY_DATA_ACCEPTED 2 @@ -1986,6 +2014,7 @@ __owur int SSL_get_early_data_status(const SSL *s); __owur int SSL_get_error(const SSL *s, int ret_code); __owur const char *SSL_get_version(const SSL *s); +__owur int SSL_get_handshake_rtt(const SSL *s, uint64_t *rtt); /* This sets the 'default' SSL version that SSL_new() will create */ # ifndef OPENSSL_NO_DEPRECATED_3_0 @@ -2281,6 +2310,8 @@ void SSL_CTX_set_record_padding_callback(SSL_CTX *ctx, void SSL_CTX_set_record_padding_callback_arg(SSL_CTX *ctx, void *arg); void *SSL_CTX_get_record_padding_callback_arg(const SSL_CTX *ctx); int SSL_CTX_set_block_padding(SSL_CTX *ctx, size_t block_size); +int SSL_CTX_set_block_padding_ex(SSL_CTX *ctx, size_t app_block_size, + size_t hs_block_size); int SSL_set_record_padding_callback(SSL *ssl, size_t (*cb) (SSL *ssl, int type, @@ -2288,12 +2319,230 @@ int SSL_set_record_padding_callback(SSL *ssl, void SSL_set_record_padding_callback_arg(SSL *ssl, void *arg); void *SSL_get_record_padding_callback_arg(const SSL *ssl); int SSL_set_block_padding(SSL *ssl, size_t block_size); - +int SSL_set_block_padding_ex(SSL *ssl, size_t app_block_size, + size_t hs_block_size); int SSL_set_num_tickets(SSL *s, size_t num_tickets); size_t SSL_get_num_tickets(const SSL *s); int SSL_CTX_set_num_tickets(SSL_CTX *ctx, size_t num_tickets); size_t SSL_CTX_get_num_tickets(const SSL_CTX *ctx); +/* QUIC support */ +int SSL_handle_events(SSL *s); +__owur int SSL_get_event_timeout(SSL *s, struct timeval *tv, 
int *is_infinite); +__owur int SSL_get_rpoll_descriptor(SSL *s, BIO_POLL_DESCRIPTOR *desc); +__owur int SSL_get_wpoll_descriptor(SSL *s, BIO_POLL_DESCRIPTOR *desc); +__owur int SSL_net_read_desired(SSL *s); +__owur int SSL_net_write_desired(SSL *s); +__owur int SSL_set_blocking_mode(SSL *s, int blocking); +__owur int SSL_get_blocking_mode(SSL *s); +__owur int SSL_set1_initial_peer_addr(SSL *s, const BIO_ADDR *peer_addr); +__owur SSL *SSL_get0_connection(SSL *s); +__owur int SSL_is_connection(SSL *s); + +#define SSL_STREAM_TYPE_NONE 0 +#define SSL_STREAM_TYPE_READ (1U << 0) +#define SSL_STREAM_TYPE_WRITE (1U << 1) +#define SSL_STREAM_TYPE_BIDI (SSL_STREAM_TYPE_READ | SSL_STREAM_TYPE_WRITE) +__owur int SSL_get_stream_type(SSL *s); + +__owur uint64_t SSL_get_stream_id(SSL *s); +__owur int SSL_is_stream_local(SSL *s); + +#define SSL_DEFAULT_STREAM_MODE_NONE 0 +#define SSL_DEFAULT_STREAM_MODE_AUTO_BIDI 1 +#define SSL_DEFAULT_STREAM_MODE_AUTO_UNI 2 +__owur int SSL_set_default_stream_mode(SSL *s, uint32_t mode); + +#define SSL_STREAM_FLAG_UNI (1U << 0) +#define SSL_STREAM_FLAG_NO_BLOCK (1U << 1) +#define SSL_STREAM_FLAG_ADVANCE (1U << 2) +__owur SSL *SSL_new_stream(SSL *s, uint64_t flags); + +#define SSL_INCOMING_STREAM_POLICY_AUTO 0 +#define SSL_INCOMING_STREAM_POLICY_ACCEPT 1 +#define SSL_INCOMING_STREAM_POLICY_REJECT 2 +__owur int SSL_set_incoming_stream_policy(SSL *s, int policy, uint64_t aec); + +#define SSL_ACCEPT_STREAM_NO_BLOCK (1U << 0) +__owur SSL *SSL_accept_stream(SSL *s, uint64_t flags); +__owur size_t SSL_get_accept_stream_queue_len(SSL *s); + +# ifndef OPENSSL_NO_QUIC +__owur int SSL_inject_net_dgram(SSL *s, const unsigned char *buf, + size_t buf_len, + const BIO_ADDR *peer, + const BIO_ADDR *local); +# endif + +typedef struct ssl_shutdown_ex_args_st { + uint64_t quic_error_code; + const char *quic_reason; +} SSL_SHUTDOWN_EX_ARGS; + +#define SSL_SHUTDOWN_FLAG_RAPID (1U << 0) +#define SSL_SHUTDOWN_FLAG_NO_STREAM_FLUSH (1U << 1) +#define SSL_SHUTDOWN_FLAG_NO_BLOCK (1U << 2) +#define SSL_SHUTDOWN_FLAG_WAIT_PEER (1U << 3) + +__owur int SSL_shutdown_ex(SSL *ssl, uint64_t flags, + const SSL_SHUTDOWN_EX_ARGS *args, + size_t args_len); + +__owur int SSL_stream_conclude(SSL *ssl, uint64_t flags); + +typedef struct ssl_stream_reset_args_st { + uint64_t quic_error_code; +} SSL_STREAM_RESET_ARGS; + +__owur int SSL_stream_reset(SSL *ssl, + const SSL_STREAM_RESET_ARGS *args, + size_t args_len); + +#define SSL_STREAM_STATE_NONE 0 +#define SSL_STREAM_STATE_OK 1 +#define SSL_STREAM_STATE_WRONG_DIR 2 +#define SSL_STREAM_STATE_FINISHED 3 +#define SSL_STREAM_STATE_RESET_LOCAL 4 +#define SSL_STREAM_STATE_RESET_REMOTE 5 +#define SSL_STREAM_STATE_CONN_CLOSED 6 +__owur int SSL_get_stream_read_state(SSL *ssl); +__owur int SSL_get_stream_write_state(SSL *ssl); + +__owur int SSL_get_stream_read_error_code(SSL *ssl, uint64_t *app_error_code); +__owur int SSL_get_stream_write_error_code(SSL *ssl, uint64_t *app_error_code); + +#define SSL_CONN_CLOSE_FLAG_LOCAL (1U << 0) +#define SSL_CONN_CLOSE_FLAG_TRANSPORT (1U << 1) + +typedef struct ssl_conn_close_info_st { + uint64_t error_code, frame_type; + const char *reason; + size_t reason_len; + uint32_t flags; +} SSL_CONN_CLOSE_INFO; + +__owur int SSL_get_conn_close_info(SSL *ssl, + SSL_CONN_CLOSE_INFO *info, + size_t info_len); + +# define SSL_VALUE_CLASS_GENERIC 0 +# define SSL_VALUE_CLASS_FEATURE_REQUEST 1 +# define SSL_VALUE_CLASS_FEATURE_PEER_REQUEST 2 +# define SSL_VALUE_CLASS_FEATURE_NEGOTIATED 3 + +# define SSL_VALUE_NONE 0 +# define 
SSL_VALUE_QUIC_STREAM_BIDI_LOCAL_AVAIL 1 +# define SSL_VALUE_QUIC_STREAM_BIDI_REMOTE_AVAIL 2 +# define SSL_VALUE_QUIC_STREAM_UNI_LOCAL_AVAIL 3 +# define SSL_VALUE_QUIC_STREAM_UNI_REMOTE_AVAIL 4 +# define SSL_VALUE_QUIC_IDLE_TIMEOUT 5 +# define SSL_VALUE_EVENT_HANDLING_MODE 6 +# define SSL_VALUE_STREAM_WRITE_BUF_SIZE 7 +# define SSL_VALUE_STREAM_WRITE_BUF_USED 8 +# define SSL_VALUE_STREAM_WRITE_BUF_AVAIL 9 + +# define SSL_VALUE_EVENT_HANDLING_MODE_INHERIT 0 +# define SSL_VALUE_EVENT_HANDLING_MODE_IMPLICIT 1 +# define SSL_VALUE_EVENT_HANDLING_MODE_EXPLICIT 2 + +int SSL_get_value_uint(SSL *s, uint32_t class_, uint32_t id, uint64_t *v); +int SSL_set_value_uint(SSL *s, uint32_t class_, uint32_t id, uint64_t v); + +# define SSL_get_generic_value_uint(ssl, id, v) \ + SSL_get_value_uint((ssl), SSL_VALUE_CLASS_GENERIC, (id), (v)) +# define SSL_set_generic_value_uint(ssl, id, v) \ + SSL_set_value_uint((ssl), SSL_VALUE_CLASS_GENERIC, (id), (v)) +# define SSL_get_feature_request_uint(ssl, id, v) \ + SSL_get_value_uint((ssl), SSL_VALUE_CLASS_FEATURE_REQUEST, (id), (v)) +# define SSL_set_feature_request_uint(ssl, id, v) \ + SSL_set_value_uint((ssl), SSL_VALUE_CLASS_FEATURE_REQUEST, (id), (v)) +# define SSL_get_feature_peer_request_uint(ssl, id, v) \ + SSL_get_value_uint((ssl), SSL_VALUE_CLASS_FEATURE_PEER_REQUEST, (id), (v)) +# define SSL_get_feature_negotiated_uint(ssl, id, v) \ + SSL_get_value_uint((ssl), SSL_VALUE_CLASS_FEATURE_NEGOTIATED, (id), (v)) + +# define SSL_get_quic_stream_bidi_local_avail(ssl, value) \ + SSL_get_generic_value_uint((ssl), SSL_VALUE_QUIC_STREAM_BIDI_LOCAL_AVAIL, \ + (value)) +# define SSL_get_quic_stream_bidi_remote_avail(ssl, value) \ + SSL_get_generic_value_uint((ssl), SSL_VALUE_QUIC_STREAM_BIDI_REMOTE_AVAIL, \ + (value)) +# define SSL_get_quic_stream_uni_local_avail(ssl, value) \ + SSL_get_generic_value_uint((ssl), SSL_VALUE_QUIC_STREAM_UNI_LOCAL_AVAIL, \ + (value)) +# define SSL_get_quic_stream_uni_remote_avail(ssl, value) \ + SSL_get_generic_value_uint((ssl), SSL_VALUE_QUIC_STREAM_UNI_REMOTE_AVAIL, \ + (value)) + +# define SSL_get_event_handling_mode(ssl, value) \ + SSL_get_generic_value_uint((ssl), SSL_VALUE_EVENT_HANDLING_MODE, \ + (value)) +# define SSL_set_event_handling_mode(ssl, value) \ + SSL_set_generic_value_uint((ssl), SSL_VALUE_EVENT_HANDLING_MODE, \ + (value)) + +# define SSL_get_stream_write_buf_size(ssl, value) \ + SSL_get_generic_value_uint((ssl), SSL_VALUE_STREAM_WRITE_BUF_SIZE, \ + (value)) +# define SSL_get_stream_write_buf_used(ssl, value) \ + SSL_get_generic_value_uint((ssl), SSL_VALUE_STREAM_WRITE_BUF_USED, \ + (value)) +# define SSL_get_stream_write_buf_avail(ssl, value) \ + SSL_get_generic_value_uint((ssl), SSL_VALUE_STREAM_WRITE_BUF_AVAIL, \ + (value)) + +# define SSL_POLL_EVENT_NONE 0 + +# define SSL_POLL_EVENT_F (1U << 0) /* F (Failure) */ +# define SSL_POLL_EVENT_EL (1U << 1) /* EL (Exception on Listener) */ +# define SSL_POLL_EVENT_EC (1U << 2) /* EC (Exception on Conn) */ +# define SSL_POLL_EVENT_ECD (1U << 3) /* ECD (Exception on Conn Drained) */ +# define SSL_POLL_EVENT_ER (1U << 4) /* ER (Exception on Read) */ +# define SSL_POLL_EVENT_EW (1U << 5) /* EW (Exception on Write) */ +# define SSL_POLL_EVENT_R (1U << 6) /* R (Readable) */ +# define SSL_POLL_EVENT_W (1U << 7) /* W (Writable) */ +# define SSL_POLL_EVENT_IC (1U << 8) /* IC (Incoming Connection) */ +# define SSL_POLL_EVENT_ISB (1U << 9) /* ISB (Incoming Stream: Bidi) */ +# define SSL_POLL_EVENT_ISU (1U << 10) /* ISU (Incoming Stream: Uni) */ +# define SSL_POLL_EVENT_OSB (1U << 11) /* 
OSB (Outgoing Stream: Bidi) */ +# define SSL_POLL_EVENT_OSU (1U << 12) /* OSU (Outgoing Stream: Uni) */ + +# define SSL_POLL_EVENT_RW (SSL_POLL_EVENT_R | SSL_POLL_EVENT_W) +# define SSL_POLL_EVENT_RE (SSL_POLL_EVENT_R | SSL_POLL_EVENT_ER) +# define SSL_POLL_EVENT_WE (SSL_POLL_EVENT_W | SSL_POLL_EVENT_EW) +# define SSL_POLL_EVENT_RWE (SSL_POLL_EVENT_RE | SSL_POLL_EVENT_WE) +# define SSL_POLL_EVENT_E (SSL_POLL_EVENT_EL | SSL_POLL_EVENT_EC \ + | SSL_POLL_EVENT_ER | SSL_POLL_EVENT_EW) +# define SSL_POLL_EVENT_IS (SSL_POLL_EVENT_ISB | SSL_POLL_EVENT_ISU) +# define SSL_POLL_EVENT_ISE (SSL_POLL_EVENT_IS | SSL_POLL_EVENT_EC) +# define SSL_POLL_EVENT_I (SSL_POLL_EVENT_IS | SSL_POLL_EVENT_IC) +# define SSL_POLL_EVENT_OS (SSL_POLL_EVENT_OSB | SSL_POLL_EVENT_OSU) +# define SSL_POLL_EVENT_OSE (SSL_POLL_EVENT_OS | SSL_POLL_EVENT_EC) + +typedef struct ssl_poll_item_st { + BIO_POLL_DESCRIPTOR desc; + uint64_t events, revents; +} SSL_POLL_ITEM; + +# define SSL_POLL_FLAG_NO_HANDLE_EVENTS (1U << 0) + +__owur int SSL_poll(SSL_POLL_ITEM *items, + size_t num_items, + size_t stride, + const struct timeval *timeout, + uint64_t flags, + size_t *result_count); + +static ossl_inline ossl_unused BIO_POLL_DESCRIPTOR +SSL_as_poll_descriptor(SSL *s) +{ + BIO_POLL_DESCRIPTOR d; + + d.type = BIO_POLL_DESCRIPTOR_TYPE_SSL; + d.value.ssl = s; + return d; +} + # ifndef OPENSSL_NO_DEPRECATED_1_1_0 # define SSL_cache_hit(s) SSL_session_reused(s) # endif @@ -2593,6 +2842,36 @@ void SSL_set_allow_early_data_cb(SSL *s, const char *OSSL_default_cipher_list(void); const char *OSSL_default_ciphersuites(void); +/* RFC8879 Certificate compression APIs */ + +int SSL_CTX_compress_certs(SSL_CTX *ctx, int alg); +int SSL_compress_certs(SSL *ssl, int alg); + +int SSL_CTX_set1_cert_comp_preference(SSL_CTX *ctx, int *algs, size_t len); +int SSL_set1_cert_comp_preference(SSL *ssl, int *algs, size_t len); + +int SSL_CTX_set1_compressed_cert(SSL_CTX *ctx, int algorithm, unsigned char *comp_data, + size_t comp_length, size_t orig_length); +int SSL_set1_compressed_cert(SSL *ssl, int algorithm, unsigned char *comp_data, + size_t comp_length, size_t orig_length); +size_t SSL_CTX_get1_compressed_cert(SSL_CTX *ctx, int alg, unsigned char **data, size_t *orig_len); +size_t SSL_get1_compressed_cert(SSL *ssl, int alg, unsigned char **data, size_t *orig_len); + +__owur int SSL_add_expected_rpk(SSL *s, EVP_PKEY *rpk); +__owur EVP_PKEY *SSL_get0_peer_rpk(const SSL *s); +__owur EVP_PKEY *SSL_SESSION_get0_peer_rpk(SSL_SESSION *s); +__owur int SSL_get_negotiated_client_cert_type(const SSL *s); +__owur int SSL_get_negotiated_server_cert_type(const SSL *s); + +__owur int SSL_set1_client_cert_type(SSL *s, const unsigned char *val, size_t len); +__owur int SSL_set1_server_cert_type(SSL *s, const unsigned char *val, size_t len); +__owur int SSL_CTX_set1_client_cert_type(SSL_CTX *ctx, const unsigned char *val, size_t len); +__owur int SSL_CTX_set1_server_cert_type(SSL_CTX *ctx, const unsigned char *val, size_t len); +__owur int SSL_get0_client_cert_type(const SSL *s, unsigned char **t, size_t *len); +__owur int SSL_get0_server_cert_type(const SSL *s, unsigned char **t, size_t *len); +__owur int SSL_CTX_get0_client_cert_type(const SSL_CTX *ctx, unsigned char **t, size_t *len); +__owur int SSL_CTX_get0_server_cert_type(const SSL_CTX *s, unsigned char **t, size_t *len); + # ifdef __cplusplus } # endif diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/include/openssl/x509.h b/CryptoPkg/Library/OpensslLib/OpensslGen/include/openssl/x509.h index 
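Among the QUIC additions in the ssl.h hunk above, SSL_poll() takes an array of SSL_POLL_ITEM entries and reports readiness through the SSL_POLL_EVENT_* masks, while SSL_as_poll_descriptor() wraps an SSL object as a BIO_POLL_DESCRIPTOR. A minimal single-item readability poll, sketched only from those declarations (QUIC connection setup and error reporting omitted):

#include <openssl/ssl.h>

/* Wait up to five seconds for `conn` (a QUIC SSL object) to become readable. */
static int wait_until_readable(SSL *conn)
{
    SSL_POLL_ITEM  item;
    struct timeval timeout = { 5, 0 };
    size_t         result_count = 0;

    item.desc    = SSL_as_poll_descriptor(conn);
    item.events  = SSL_POLL_EVENT_R;   /* only interested in "readable" */
    item.revents = 0;

    if (!SSL_poll(&item, 1, sizeof(item), &timeout, 0, &result_count))
        return 0;                      /* poll call itself failed */

    return (item.revents & SSL_POLL_EVENT_R) != 0;
}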
35fb14eff337..e07745b571bc 100644 --- a/CryptoPkg/Library/OpensslLib/OpensslGen/include/openssl/x509.h +++ b/CryptoPkg/Library/OpensslLib/OpensslGen/include/openssl/x509.h @@ -2,7 +2,7 @@ * WARNING: do not edit! * Generated by Makefile from include/openssl/x509.h.in * - * Copyright 1995-2022 The OpenSSL Project Authors. All Rights Reserved. + * Copyright 1995-2024 The OpenSSL Project Authors. All Rights Reserved. * Copyright (c) 2002, Oracle and/or its affiliates. All rights reserved * * Licensed under the Apache License 2.0 (the "License"). You may not use @@ -40,6 +40,9 @@ # include # include +# ifndef OPENSSL_NO_STDIO +# include +# endif #ifdef __cplusplus extern "C" { @@ -162,16 +165,24 @@ SKM_DEFINE_STACK_OF_INTERNAL(X509_CRL, X509_CRL, X509_CRL) # define X509_FILETYPE_ASN1 2 # define X509_FILETYPE_DEFAULT 3 -# define X509v3_KU_DIGITAL_SIGNATURE 0x0080 -# define X509v3_KU_NON_REPUDIATION 0x0040 -# define X509v3_KU_KEY_ENCIPHERMENT 0x0020 -# define X509v3_KU_DATA_ENCIPHERMENT 0x0010 -# define X509v3_KU_KEY_AGREEMENT 0x0008 -# define X509v3_KU_KEY_CERT_SIGN 0x0004 -# define X509v3_KU_CRL_SIGN 0x0002 -# define X509v3_KU_ENCIPHER_ONLY 0x0001 -# define X509v3_KU_DECIPHER_ONLY 0x8000 -# define X509v3_KU_UNDEF 0xffff +/*- + * : + * The KeyUsage BITSTRING is treated as a little-endian integer, hence bit `0` + * is 0x80, while bit `7` is 0x01 (the LSB of the integer value), bit `8` is + * then the MSB of the second octet, or 0x8000. + */ +# define X509v3_KU_DIGITAL_SIGNATURE 0x0080 /* (0) */ +# define X509v3_KU_NON_REPUDIATION 0x0040 /* (1) */ +# define X509v3_KU_KEY_ENCIPHERMENT 0x0020 /* (2) */ +# define X509v3_KU_DATA_ENCIPHERMENT 0x0010 /* (3) */ +# define X509v3_KU_KEY_AGREEMENT 0x0008 /* (4) */ +# define X509v3_KU_KEY_CERT_SIGN 0x0004 /* (5) */ +# define X509v3_KU_CRL_SIGN 0x0002 /* (6) */ +# define X509v3_KU_ENCIPHER_ONLY 0x0001 /* (7) */ +# define X509v3_KU_DECIPHER_ONLY 0x8000 /* (8) */ +# ifndef OPENSSL_NO_DEPRECATED_3_4 +# define X509v3_KU_UNDEF 0xffff /* vestigial, not used */ +# endif struct X509_algor_st { ASN1_OBJECT *algorithm; @@ -462,7 +473,12 @@ typedef struct PBKDF2PARAM_st { X509_ALGOR *prf; } PBKDF2PARAM; -#ifndef OPENSSL_NO_SCRYPT +typedef struct { + X509_ALGOR *keyDerivationFunc; + X509_ALGOR *messageAuthScheme; +} PBMAC1PARAM; + +# ifndef OPENSSL_NO_SCRYPT typedef struct SCRYPT_PARAMS_st { ASN1_OCTET_STRING *salt; ASN1_INTEGER *costParameter; @@ -470,7 +486,7 @@ typedef struct SCRYPT_PARAMS_st { ASN1_INTEGER *parallelizationParameter; ASN1_INTEGER *keyLength; } SCRYPT_PARAMS; -#endif +# endif #ifdef __cplusplus } @@ -603,6 +619,8 @@ EVP_PKEY *d2i_PrivateKey_ex_fp(FILE *fp, EVP_PKEY **a, OSSL_LIB_CTX *libctx, const char *propq); EVP_PKEY *d2i_PrivateKey_fp(FILE *fp, EVP_PKEY **a); int i2d_PUBKEY_fp(FILE *fp, const EVP_PKEY *pkey); +EVP_PKEY *d2i_PUBKEY_ex_fp(FILE *fp, EVP_PKEY **a, OSSL_LIB_CTX *libctx, + const char *propq); EVP_PKEY *d2i_PUBKEY_fp(FILE *fp, EVP_PKEY **a); # endif @@ -651,6 +669,8 @@ EVP_PKEY *d2i_PrivateKey_ex_bio(BIO *bp, EVP_PKEY **a, OSSL_LIB_CTX *libctx, const char *propq); EVP_PKEY *d2i_PrivateKey_bio(BIO *bp, EVP_PKEY **a); int i2d_PUBKEY_bio(BIO *bp, const EVP_PKEY *pkey); +EVP_PKEY *d2i_PUBKEY_ex_bio(BIO *bp, EVP_PKEY **a, OSSL_LIB_CTX *libctx, + const char *propq); EVP_PKEY *d2i_PUBKEY_bio(BIO *bp, EVP_PKEY **a); DECLARE_ASN1_DUP_FUNCTION(X509) @@ -884,12 +904,12 @@ int X509_REQ_get_signature_nid(const X509_REQ *req); int i2d_re_X509_REQ_tbs(X509_REQ *req, unsigned char **pp); int X509_REQ_set_pubkey(X509_REQ *x, EVP_PKEY *pkey); EVP_PKEY 
*X509_REQ_get_pubkey(X509_REQ *req); -EVP_PKEY *X509_REQ_get0_pubkey(X509_REQ *req); +EVP_PKEY *X509_REQ_get0_pubkey(const X509_REQ *req); X509_PUBKEY *X509_REQ_get_X509_PUBKEY(X509_REQ *req); int X509_REQ_extension_nid(int nid); int *X509_REQ_get_extension_nids(void); void X509_REQ_set_extension_nids(int *nids); -STACK_OF(X509_EXTENSION) *X509_REQ_get_extensions(X509_REQ *req); +STACK_OF(X509_EXTENSION) *X509_REQ_get_extensions(OSSL_FUTURE_CONST X509_REQ *req); int X509_REQ_add_extensions_nid(X509_REQ *req, const STACK_OF(X509_EXTENSION) *exts, int nid); int X509_REQ_add_extensions(X509_REQ *req, const STACK_OF(X509_EXTENSION) *ext); @@ -950,13 +970,14 @@ X509_REVOKED_get0_extensions(const X509_REVOKED *r); X509_CRL *X509_CRL_diff(X509_CRL *base, X509_CRL *newer, EVP_PKEY *skey, const EVP_MD *md, unsigned int flags); -int X509_REQ_check_private_key(X509_REQ *x509, EVP_PKEY *pkey); +int X509_REQ_check_private_key(const X509_REQ *req, EVP_PKEY *pkey); -int X509_check_private_key(const X509 *x509, const EVP_PKEY *pkey); +int X509_check_private_key(const X509 *cert, const EVP_PKEY *pkey); int X509_chain_check_suiteb(int *perror_depth, X509 *x, STACK_OF(X509) *chain, unsigned long flags); int X509_CRL_check_suiteb(X509_CRL *crl, EVP_PKEY *pk, unsigned long flags); +void OSSL_STACK_OF_X509_free(STACK_OF(X509) *certs); STACK_OF(X509) *X509_chain_up_ref(STACK_OF(X509) *chain); int X509_issuer_and_serial_cmp(const X509 *a, const X509 *b); @@ -1077,6 +1098,9 @@ X509_EXTENSION *X509v3_get_ext(const STACK_OF(X509_EXTENSION) *x, int loc); X509_EXTENSION *X509v3_delete_ext(STACK_OF(X509_EXTENSION) *x, int loc); STACK_OF(X509_EXTENSION) *X509v3_add_ext(STACK_OF(X509_EXTENSION) **x, X509_EXTENSION *ex, int loc); +STACK_OF(X509_EXTENSION) + *X509v3_add_extensions(STACK_OF(X509_EXTENSION) **target, + const STACK_OF(X509_EXTENSION) *exts); int X509_get_ext_count(const X509 *x); int X509_get_ext_by_NID(const X509 *x, int nid, int lastpos); @@ -1198,9 +1222,10 @@ X509 *X509_find_by_subject(STACK_OF(X509) *sk, const X509_NAME *name); DECLARE_ASN1_FUNCTIONS(PBEPARAM) DECLARE_ASN1_FUNCTIONS(PBE2PARAM) DECLARE_ASN1_FUNCTIONS(PBKDF2PARAM) -#ifndef OPENSSL_NO_SCRYPT +DECLARE_ASN1_FUNCTIONS(PBMAC1PARAM) +# ifndef OPENSSL_NO_SCRYPT DECLARE_ASN1_FUNCTIONS(SCRYPT_PARAMS) -#endif +# endif int PKCS5_pbe_set0_algor(X509_ALGOR *algor, int alg, int iter, const unsigned char *salt, int saltlen); @@ -1237,6 +1262,7 @@ X509_ALGOR *PKCS5_pbkdf2_set_ex(int iter, unsigned char *salt, int saltlen, int prf_nid, int keylen, OSSL_LIB_CTX *libctx); +PBKDF2PARAM *PBMAC1_get1_pbkdf2_param(const X509_ALGOR *macalg); /* PKCS#8 utilities */ DECLARE_ASN1_FUNCTIONS(PKCS8_PRIV_KEY_INFO) @@ -1262,6 +1288,8 @@ int PKCS8_pkey_add1_attr_by_OBJ(PKCS8_PRIV_KEY_INFO *p8, const ASN1_OBJECT *obj, int type, const unsigned char *bytes, int len); +void X509_PUBKEY_set0_public_key(X509_PUBKEY *pub, + unsigned char *penc, int penclen); int X509_PUBKEY_set0_param(X509_PUBKEY *pub, ASN1_OBJECT *aobj, int ptype, void *pval, unsigned char *penc, int penclen); diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/include/openssl/x509_acert.h b/CryptoPkg/Library/OpensslLib/OpensslGen/include/openssl/x509_acert.h new file mode 100644 index 000000000000..86babde0d87e --- /dev/null +++ b/CryptoPkg/Library/OpensslLib/OpensslGen/include/openssl/x509_acert.h @@ -0,0 +1,263 @@ +/* + * WARNING: do not edit! + * Generated by Makefile from include/openssl/x509_acert.h.in + * + * Copyright 2022-2024 The OpenSSL Project Authors. All Rights Reserved. 
+ * + * Licensed under the Apache License 2.0 (the "License"). You may not use + * this file except in compliance with the License. You can obtain a copy + * in the file LICENSE in the source distribution or at + * https://www.openssl.org/source/license.html + */ + + + +#ifndef OPENSSL_X509_ACERT_H +# define OPENSSL_X509_ACERT_H +# pragma once + +# include +# include +# include + +typedef struct X509_acert_st X509_ACERT; +typedef struct X509_acert_info_st X509_ACERT_INFO; +typedef struct ossl_object_digest_info_st OSSL_OBJECT_DIGEST_INFO; +typedef struct ossl_issuer_serial_st OSSL_ISSUER_SERIAL; +typedef struct X509_acert_issuer_v2form_st X509_ACERT_ISSUER_V2FORM; + +DECLARE_ASN1_FUNCTIONS(X509_ACERT) +DECLARE_ASN1_DUP_FUNCTION(X509_ACERT) +DECLARE_ASN1_ITEM(X509_ACERT_INFO) +DECLARE_ASN1_ALLOC_FUNCTIONS(X509_ACERT_INFO) +DECLARE_ASN1_ALLOC_FUNCTIONS(OSSL_OBJECT_DIGEST_INFO) +DECLARE_ASN1_ALLOC_FUNCTIONS(OSSL_ISSUER_SERIAL) +DECLARE_ASN1_ALLOC_FUNCTIONS(X509_ACERT_ISSUER_V2FORM) + +# ifndef OPENSSL_NO_STDIO +X509_ACERT *d2i_X509_ACERT_fp(FILE *fp, X509_ACERT **acert); +int i2d_X509_ACERT_fp(FILE *fp, const X509_ACERT *acert); +# endif + +DECLARE_PEM_rw(X509_ACERT, X509_ACERT) + +X509_ACERT *d2i_X509_ACERT_bio(BIO *bp, X509_ACERT **acert); +int i2d_X509_ACERT_bio(BIO *bp, const X509_ACERT *acert); + +int X509_ACERT_sign(X509_ACERT *x, EVP_PKEY *pkey, const EVP_MD *md); +int X509_ACERT_sign_ctx(X509_ACERT *x, EVP_MD_CTX *ctx); +int X509_ACERT_verify(X509_ACERT *a, EVP_PKEY *r); + +# define X509_ACERT_VERSION_2 1 + +const GENERAL_NAMES *X509_ACERT_get0_holder_entityName(const X509_ACERT *x); +const OSSL_ISSUER_SERIAL *X509_ACERT_get0_holder_baseCertId(const X509_ACERT *x); +const OSSL_OBJECT_DIGEST_INFO * X509_ACERT_get0_holder_digest(const X509_ACERT *x); +const X509_NAME *X509_ACERT_get0_issuerName(const X509_ACERT *x); +long X509_ACERT_get_version(const X509_ACERT *x); +void X509_ACERT_get0_signature(const X509_ACERT *x, + const ASN1_BIT_STRING **psig, + const X509_ALGOR **palg); +int X509_ACERT_get_signature_nid(const X509_ACERT *x); +const X509_ALGOR *X509_ACERT_get0_info_sigalg(const X509_ACERT *x); +const ASN1_INTEGER *X509_ACERT_get0_serialNumber(const X509_ACERT *x); +const ASN1_TIME *X509_ACERT_get0_notBefore(const X509_ACERT *x); +const ASN1_TIME *X509_ACERT_get0_notAfter(const X509_ACERT *x); +const ASN1_BIT_STRING *X509_ACERT_get0_issuerUID(const X509_ACERT *x); + +int X509_ACERT_print(BIO *bp, X509_ACERT *x); +int X509_ACERT_print_ex(BIO *bp, X509_ACERT *x, unsigned long nmflags, + unsigned long cflag); + +int X509_ACERT_get_attr_count(const X509_ACERT *x); +int X509_ACERT_get_attr_by_NID(const X509_ACERT *x, int nid, int lastpos); +int X509_ACERT_get_attr_by_OBJ(const X509_ACERT *x, const ASN1_OBJECT *obj, + int lastpos); +X509_ATTRIBUTE *X509_ACERT_get_attr(const X509_ACERT *x, int loc); +X509_ATTRIBUTE *X509_ACERT_delete_attr(X509_ACERT *x, int loc); + +void *X509_ACERT_get_ext_d2i(const X509_ACERT *x, int nid, int *crit, int *idx); +int X509_ACERT_add1_ext_i2d(X509_ACERT *x, int nid, void *value, int crit, + unsigned long flags); +const STACK_OF(X509_EXTENSION) *X509_ACERT_get0_extensions(const X509_ACERT *x); + +# define OSSL_OBJECT_DIGEST_INFO_PUBLIC_KEY 0 +# define OSSL_OBJECT_DIGEST_INFO_PUBLIC_KEY_CERT 1 +# define OSSL_OBJECT_DIGEST_INFO_OTHER 2 /* must not be used in RFC 5755 profile */ +int X509_ACERT_set_version(X509_ACERT *x, long version); +void X509_ACERT_set0_holder_entityName(X509_ACERT *x, GENERAL_NAMES *name); +void X509_ACERT_set0_holder_baseCertId(X509_ACERT 
*x, OSSL_ISSUER_SERIAL *isss); +void X509_ACERT_set0_holder_digest(X509_ACERT *x, + OSSL_OBJECT_DIGEST_INFO *dinfo); + +int X509_ACERT_add1_attr(X509_ACERT *x, X509_ATTRIBUTE *attr); +int X509_ACERT_add1_attr_by_OBJ(X509_ACERT *x, const ASN1_OBJECT *obj, + int type, const void *bytes, int len); +int X509_ACERT_add1_attr_by_NID(X509_ACERT *x, int nid, int type, + const void *bytes, int len); +int X509_ACERT_add1_attr_by_txt(X509_ACERT *x, const char *attrname, int type, + const unsigned char *bytes, int len); +int X509_ACERT_add_attr_nconf(CONF *conf, const char *section, + X509_ACERT *acert); + +int X509_ACERT_set1_issuerName(X509_ACERT *x, const X509_NAME *name); +int X509_ACERT_set1_serialNumber(X509_ACERT *x, const ASN1_INTEGER *serial); +int X509_ACERT_set1_notBefore(X509_ACERT *x, const ASN1_GENERALIZEDTIME *time); +int X509_ACERT_set1_notAfter(X509_ACERT *x, const ASN1_GENERALIZEDTIME *time); + +void OSSL_OBJECT_DIGEST_INFO_get0_digest(const OSSL_OBJECT_DIGEST_INFO *o, + int *digestedObjectType, + const X509_ALGOR **digestAlgorithm, + const ASN1_BIT_STRING **digest); + +int OSSL_OBJECT_DIGEST_INFO_set1_digest(OSSL_OBJECT_DIGEST_INFO *o, + int digestedObjectType, + X509_ALGOR *digestAlgorithm, + ASN1_BIT_STRING *digest); + +const X509_NAME *OSSL_ISSUER_SERIAL_get0_issuer(const OSSL_ISSUER_SERIAL *isss); +const ASN1_INTEGER *OSSL_ISSUER_SERIAL_get0_serial(const OSSL_ISSUER_SERIAL *isss); +const ASN1_BIT_STRING *OSSL_ISSUER_SERIAL_get0_issuerUID(const OSSL_ISSUER_SERIAL *isss); + +int OSSL_ISSUER_SERIAL_set1_issuer(OSSL_ISSUER_SERIAL *isss, + const X509_NAME *issuer); +int OSSL_ISSUER_SERIAL_set1_serial(OSSL_ISSUER_SERIAL *isss, + const ASN1_INTEGER *serial); +int OSSL_ISSUER_SERIAL_set1_issuerUID(OSSL_ISSUER_SERIAL *isss, + const ASN1_BIT_STRING *uid); + +# define OSSL_IETFAS_OCTETS 0 +# define OSSL_IETFAS_OID 1 +# define OSSL_IETFAS_STRING 2 + +typedef struct OSSL_IETF_ATTR_SYNTAX_VALUE_st OSSL_IETF_ATTR_SYNTAX_VALUE; +typedef struct OSSL_IETF_ATTR_SYNTAX_st OSSL_IETF_ATTR_SYNTAX; +SKM_DEFINE_STACK_OF_INTERNAL(OSSL_IETF_ATTR_SYNTAX_VALUE, OSSL_IETF_ATTR_SYNTAX_VALUE, OSSL_IETF_ATTR_SYNTAX_VALUE) +#define sk_OSSL_IETF_ATTR_SYNTAX_VALUE_num(sk) OPENSSL_sk_num(ossl_check_const_OSSL_IETF_ATTR_SYNTAX_VALUE_sk_type(sk)) +#define sk_OSSL_IETF_ATTR_SYNTAX_VALUE_value(sk, idx) ((OSSL_IETF_ATTR_SYNTAX_VALUE *)OPENSSL_sk_value(ossl_check_const_OSSL_IETF_ATTR_SYNTAX_VALUE_sk_type(sk), (idx))) +#define sk_OSSL_IETF_ATTR_SYNTAX_VALUE_new(cmp) ((STACK_OF(OSSL_IETF_ATTR_SYNTAX_VALUE) *)OPENSSL_sk_new(ossl_check_OSSL_IETF_ATTR_SYNTAX_VALUE_compfunc_type(cmp))) +#define sk_OSSL_IETF_ATTR_SYNTAX_VALUE_new_null() ((STACK_OF(OSSL_IETF_ATTR_SYNTAX_VALUE) *)OPENSSL_sk_new_null()) +#define sk_OSSL_IETF_ATTR_SYNTAX_VALUE_new_reserve(cmp, n) ((STACK_OF(OSSL_IETF_ATTR_SYNTAX_VALUE) *)OPENSSL_sk_new_reserve(ossl_check_OSSL_IETF_ATTR_SYNTAX_VALUE_compfunc_type(cmp), (n))) +#define sk_OSSL_IETF_ATTR_SYNTAX_VALUE_reserve(sk, n) OPENSSL_sk_reserve(ossl_check_OSSL_IETF_ATTR_SYNTAX_VALUE_sk_type(sk), (n)) +#define sk_OSSL_IETF_ATTR_SYNTAX_VALUE_free(sk) OPENSSL_sk_free(ossl_check_OSSL_IETF_ATTR_SYNTAX_VALUE_sk_type(sk)) +#define sk_OSSL_IETF_ATTR_SYNTAX_VALUE_zero(sk) OPENSSL_sk_zero(ossl_check_OSSL_IETF_ATTR_SYNTAX_VALUE_sk_type(sk)) +#define sk_OSSL_IETF_ATTR_SYNTAX_VALUE_delete(sk, i) ((OSSL_IETF_ATTR_SYNTAX_VALUE *)OPENSSL_sk_delete(ossl_check_OSSL_IETF_ATTR_SYNTAX_VALUE_sk_type(sk), (i))) +#define sk_OSSL_IETF_ATTR_SYNTAX_VALUE_delete_ptr(sk, ptr) ((OSSL_IETF_ATTR_SYNTAX_VALUE 
*)OPENSSL_sk_delete_ptr(ossl_check_OSSL_IETF_ATTR_SYNTAX_VALUE_sk_type(sk), ossl_check_OSSL_IETF_ATTR_SYNTAX_VALUE_type(ptr))) +#define sk_OSSL_IETF_ATTR_SYNTAX_VALUE_push(sk, ptr) OPENSSL_sk_push(ossl_check_OSSL_IETF_ATTR_SYNTAX_VALUE_sk_type(sk), ossl_check_OSSL_IETF_ATTR_SYNTAX_VALUE_type(ptr)) +#define sk_OSSL_IETF_ATTR_SYNTAX_VALUE_unshift(sk, ptr) OPENSSL_sk_unshift(ossl_check_OSSL_IETF_ATTR_SYNTAX_VALUE_sk_type(sk), ossl_check_OSSL_IETF_ATTR_SYNTAX_VALUE_type(ptr)) +#define sk_OSSL_IETF_ATTR_SYNTAX_VALUE_pop(sk) ((OSSL_IETF_ATTR_SYNTAX_VALUE *)OPENSSL_sk_pop(ossl_check_OSSL_IETF_ATTR_SYNTAX_VALUE_sk_type(sk))) +#define sk_OSSL_IETF_ATTR_SYNTAX_VALUE_shift(sk) ((OSSL_IETF_ATTR_SYNTAX_VALUE *)OPENSSL_sk_shift(ossl_check_OSSL_IETF_ATTR_SYNTAX_VALUE_sk_type(sk))) +#define sk_OSSL_IETF_ATTR_SYNTAX_VALUE_pop_free(sk, freefunc) OPENSSL_sk_pop_free(ossl_check_OSSL_IETF_ATTR_SYNTAX_VALUE_sk_type(sk),ossl_check_OSSL_IETF_ATTR_SYNTAX_VALUE_freefunc_type(freefunc)) +#define sk_OSSL_IETF_ATTR_SYNTAX_VALUE_insert(sk, ptr, idx) OPENSSL_sk_insert(ossl_check_OSSL_IETF_ATTR_SYNTAX_VALUE_sk_type(sk), ossl_check_OSSL_IETF_ATTR_SYNTAX_VALUE_type(ptr), (idx)) +#define sk_OSSL_IETF_ATTR_SYNTAX_VALUE_set(sk, idx, ptr) ((OSSL_IETF_ATTR_SYNTAX_VALUE *)OPENSSL_sk_set(ossl_check_OSSL_IETF_ATTR_SYNTAX_VALUE_sk_type(sk), (idx), ossl_check_OSSL_IETF_ATTR_SYNTAX_VALUE_type(ptr))) +#define sk_OSSL_IETF_ATTR_SYNTAX_VALUE_find(sk, ptr) OPENSSL_sk_find(ossl_check_OSSL_IETF_ATTR_SYNTAX_VALUE_sk_type(sk), ossl_check_OSSL_IETF_ATTR_SYNTAX_VALUE_type(ptr)) +#define sk_OSSL_IETF_ATTR_SYNTAX_VALUE_find_ex(sk, ptr) OPENSSL_sk_find_ex(ossl_check_OSSL_IETF_ATTR_SYNTAX_VALUE_sk_type(sk), ossl_check_OSSL_IETF_ATTR_SYNTAX_VALUE_type(ptr)) +#define sk_OSSL_IETF_ATTR_SYNTAX_VALUE_find_all(sk, ptr, pnum) OPENSSL_sk_find_all(ossl_check_OSSL_IETF_ATTR_SYNTAX_VALUE_sk_type(sk), ossl_check_OSSL_IETF_ATTR_SYNTAX_VALUE_type(ptr), pnum) +#define sk_OSSL_IETF_ATTR_SYNTAX_VALUE_sort(sk) OPENSSL_sk_sort(ossl_check_OSSL_IETF_ATTR_SYNTAX_VALUE_sk_type(sk)) +#define sk_OSSL_IETF_ATTR_SYNTAX_VALUE_is_sorted(sk) OPENSSL_sk_is_sorted(ossl_check_const_OSSL_IETF_ATTR_SYNTAX_VALUE_sk_type(sk)) +#define sk_OSSL_IETF_ATTR_SYNTAX_VALUE_dup(sk) ((STACK_OF(OSSL_IETF_ATTR_SYNTAX_VALUE) *)OPENSSL_sk_dup(ossl_check_const_OSSL_IETF_ATTR_SYNTAX_VALUE_sk_type(sk))) +#define sk_OSSL_IETF_ATTR_SYNTAX_VALUE_deep_copy(sk, copyfunc, freefunc) ((STACK_OF(OSSL_IETF_ATTR_SYNTAX_VALUE) *)OPENSSL_sk_deep_copy(ossl_check_const_OSSL_IETF_ATTR_SYNTAX_VALUE_sk_type(sk), ossl_check_OSSL_IETF_ATTR_SYNTAX_VALUE_copyfunc_type(copyfunc), ossl_check_OSSL_IETF_ATTR_SYNTAX_VALUE_freefunc_type(freefunc))) +#define sk_OSSL_IETF_ATTR_SYNTAX_VALUE_set_cmp_func(sk, cmp) ((sk_OSSL_IETF_ATTR_SYNTAX_VALUE_compfunc)OPENSSL_sk_set_cmp_func(ossl_check_OSSL_IETF_ATTR_SYNTAX_VALUE_sk_type(sk), ossl_check_OSSL_IETF_ATTR_SYNTAX_VALUE_compfunc_type(cmp))) + + +DECLARE_ASN1_ITEM(OSSL_IETF_ATTR_SYNTAX_VALUE) +DECLARE_ASN1_ALLOC_FUNCTIONS(OSSL_IETF_ATTR_SYNTAX_VALUE) +DECLARE_ASN1_FUNCTIONS(OSSL_IETF_ATTR_SYNTAX) + +const GENERAL_NAMES * +OSSL_IETF_ATTR_SYNTAX_get0_policyAuthority(const OSSL_IETF_ATTR_SYNTAX *a); +void OSSL_IETF_ATTR_SYNTAX_set0_policyAuthority(OSSL_IETF_ATTR_SYNTAX *a, + GENERAL_NAMES *names); + +int OSSL_IETF_ATTR_SYNTAX_get_value_num(const OSSL_IETF_ATTR_SYNTAX *a); +void *OSSL_IETF_ATTR_SYNTAX_get0_value(const OSSL_IETF_ATTR_SYNTAX *a, + int ind, int *type); +int OSSL_IETF_ATTR_SYNTAX_add1_value(OSSL_IETF_ATTR_SYNTAX *a, int type, + void *data); +int OSSL_IETF_ATTR_SYNTAX_print(BIO 
*bp, OSSL_IETF_ATTR_SYNTAX *a, int indent); + +struct TARGET_CERT_st { + OSSL_ISSUER_SERIAL *targetCertificate; + GENERAL_NAME *targetName; + OSSL_OBJECT_DIGEST_INFO *certDigestInfo; +}; + +typedef struct TARGET_CERT_st OSSL_TARGET_CERT; + +# define OSSL_TGT_TARGET_NAME 0 +# define OSSL_TGT_TARGET_GROUP 1 +# define OSSL_TGT_TARGET_CERT 2 + +typedef struct TARGET_st { + int type; + union { + GENERAL_NAME *targetName; + GENERAL_NAME *targetGroup; + OSSL_TARGET_CERT *targetCert; + } choice; +} OSSL_TARGET; + +typedef STACK_OF(OSSL_TARGET) OSSL_TARGETS; +typedef STACK_OF(OSSL_TARGETS) OSSL_TARGETING_INFORMATION; + +SKM_DEFINE_STACK_OF_INTERNAL(OSSL_TARGET, OSSL_TARGET, OSSL_TARGET) +#define sk_OSSL_TARGET_num(sk) OPENSSL_sk_num(ossl_check_const_OSSL_TARGET_sk_type(sk)) +#define sk_OSSL_TARGET_value(sk, idx) ((OSSL_TARGET *)OPENSSL_sk_value(ossl_check_const_OSSL_TARGET_sk_type(sk), (idx))) +#define sk_OSSL_TARGET_new(cmp) ((STACK_OF(OSSL_TARGET) *)OPENSSL_sk_new(ossl_check_OSSL_TARGET_compfunc_type(cmp))) +#define sk_OSSL_TARGET_new_null() ((STACK_OF(OSSL_TARGET) *)OPENSSL_sk_new_null()) +#define sk_OSSL_TARGET_new_reserve(cmp, n) ((STACK_OF(OSSL_TARGET) *)OPENSSL_sk_new_reserve(ossl_check_OSSL_TARGET_compfunc_type(cmp), (n))) +#define sk_OSSL_TARGET_reserve(sk, n) OPENSSL_sk_reserve(ossl_check_OSSL_TARGET_sk_type(sk), (n)) +#define sk_OSSL_TARGET_free(sk) OPENSSL_sk_free(ossl_check_OSSL_TARGET_sk_type(sk)) +#define sk_OSSL_TARGET_zero(sk) OPENSSL_sk_zero(ossl_check_OSSL_TARGET_sk_type(sk)) +#define sk_OSSL_TARGET_delete(sk, i) ((OSSL_TARGET *)OPENSSL_sk_delete(ossl_check_OSSL_TARGET_sk_type(sk), (i))) +#define sk_OSSL_TARGET_delete_ptr(sk, ptr) ((OSSL_TARGET *)OPENSSL_sk_delete_ptr(ossl_check_OSSL_TARGET_sk_type(sk), ossl_check_OSSL_TARGET_type(ptr))) +#define sk_OSSL_TARGET_push(sk, ptr) OPENSSL_sk_push(ossl_check_OSSL_TARGET_sk_type(sk), ossl_check_OSSL_TARGET_type(ptr)) +#define sk_OSSL_TARGET_unshift(sk, ptr) OPENSSL_sk_unshift(ossl_check_OSSL_TARGET_sk_type(sk), ossl_check_OSSL_TARGET_type(ptr)) +#define sk_OSSL_TARGET_pop(sk) ((OSSL_TARGET *)OPENSSL_sk_pop(ossl_check_OSSL_TARGET_sk_type(sk))) +#define sk_OSSL_TARGET_shift(sk) ((OSSL_TARGET *)OPENSSL_sk_shift(ossl_check_OSSL_TARGET_sk_type(sk))) +#define sk_OSSL_TARGET_pop_free(sk, freefunc) OPENSSL_sk_pop_free(ossl_check_OSSL_TARGET_sk_type(sk),ossl_check_OSSL_TARGET_freefunc_type(freefunc)) +#define sk_OSSL_TARGET_insert(sk, ptr, idx) OPENSSL_sk_insert(ossl_check_OSSL_TARGET_sk_type(sk), ossl_check_OSSL_TARGET_type(ptr), (idx)) +#define sk_OSSL_TARGET_set(sk, idx, ptr) ((OSSL_TARGET *)OPENSSL_sk_set(ossl_check_OSSL_TARGET_sk_type(sk), (idx), ossl_check_OSSL_TARGET_type(ptr))) +#define sk_OSSL_TARGET_find(sk, ptr) OPENSSL_sk_find(ossl_check_OSSL_TARGET_sk_type(sk), ossl_check_OSSL_TARGET_type(ptr)) +#define sk_OSSL_TARGET_find_ex(sk, ptr) OPENSSL_sk_find_ex(ossl_check_OSSL_TARGET_sk_type(sk), ossl_check_OSSL_TARGET_type(ptr)) +#define sk_OSSL_TARGET_find_all(sk, ptr, pnum) OPENSSL_sk_find_all(ossl_check_OSSL_TARGET_sk_type(sk), ossl_check_OSSL_TARGET_type(ptr), pnum) +#define sk_OSSL_TARGET_sort(sk) OPENSSL_sk_sort(ossl_check_OSSL_TARGET_sk_type(sk)) +#define sk_OSSL_TARGET_is_sorted(sk) OPENSSL_sk_is_sorted(ossl_check_const_OSSL_TARGET_sk_type(sk)) +#define sk_OSSL_TARGET_dup(sk) ((STACK_OF(OSSL_TARGET) *)OPENSSL_sk_dup(ossl_check_const_OSSL_TARGET_sk_type(sk))) +#define sk_OSSL_TARGET_deep_copy(sk, copyfunc, freefunc) ((STACK_OF(OSSL_TARGET) *)OPENSSL_sk_deep_copy(ossl_check_const_OSSL_TARGET_sk_type(sk), 
ossl_check_OSSL_TARGET_copyfunc_type(copyfunc), ossl_check_OSSL_TARGET_freefunc_type(freefunc))) +#define sk_OSSL_TARGET_set_cmp_func(sk, cmp) ((sk_OSSL_TARGET_compfunc)OPENSSL_sk_set_cmp_func(ossl_check_OSSL_TARGET_sk_type(sk), ossl_check_OSSL_TARGET_compfunc_type(cmp))) + + +SKM_DEFINE_STACK_OF_INTERNAL(OSSL_TARGETS, OSSL_TARGETS, OSSL_TARGETS) +#define sk_OSSL_TARGETS_num(sk) OPENSSL_sk_num(ossl_check_const_OSSL_TARGETS_sk_type(sk)) +#define sk_OSSL_TARGETS_value(sk, idx) ((OSSL_TARGETS *)OPENSSL_sk_value(ossl_check_const_OSSL_TARGETS_sk_type(sk), (idx))) +#define sk_OSSL_TARGETS_new(cmp) ((STACK_OF(OSSL_TARGETS) *)OPENSSL_sk_new(ossl_check_OSSL_TARGETS_compfunc_type(cmp))) +#define sk_OSSL_TARGETS_new_null() ((STACK_OF(OSSL_TARGETS) *)OPENSSL_sk_new_null()) +#define sk_OSSL_TARGETS_new_reserve(cmp, n) ((STACK_OF(OSSL_TARGETS) *)OPENSSL_sk_new_reserve(ossl_check_OSSL_TARGETS_compfunc_type(cmp), (n))) +#define sk_OSSL_TARGETS_reserve(sk, n) OPENSSL_sk_reserve(ossl_check_OSSL_TARGETS_sk_type(sk), (n)) +#define sk_OSSL_TARGETS_free(sk) OPENSSL_sk_free(ossl_check_OSSL_TARGETS_sk_type(sk)) +#define sk_OSSL_TARGETS_zero(sk) OPENSSL_sk_zero(ossl_check_OSSL_TARGETS_sk_type(sk)) +#define sk_OSSL_TARGETS_delete(sk, i) ((OSSL_TARGETS *)OPENSSL_sk_delete(ossl_check_OSSL_TARGETS_sk_type(sk), (i))) +#define sk_OSSL_TARGETS_delete_ptr(sk, ptr) ((OSSL_TARGETS *)OPENSSL_sk_delete_ptr(ossl_check_OSSL_TARGETS_sk_type(sk), ossl_check_OSSL_TARGETS_type(ptr))) +#define sk_OSSL_TARGETS_push(sk, ptr) OPENSSL_sk_push(ossl_check_OSSL_TARGETS_sk_type(sk), ossl_check_OSSL_TARGETS_type(ptr)) +#define sk_OSSL_TARGETS_unshift(sk, ptr) OPENSSL_sk_unshift(ossl_check_OSSL_TARGETS_sk_type(sk), ossl_check_OSSL_TARGETS_type(ptr)) +#define sk_OSSL_TARGETS_pop(sk) ((OSSL_TARGETS *)OPENSSL_sk_pop(ossl_check_OSSL_TARGETS_sk_type(sk))) +#define sk_OSSL_TARGETS_shift(sk) ((OSSL_TARGETS *)OPENSSL_sk_shift(ossl_check_OSSL_TARGETS_sk_type(sk))) +#define sk_OSSL_TARGETS_pop_free(sk, freefunc) OPENSSL_sk_pop_free(ossl_check_OSSL_TARGETS_sk_type(sk),ossl_check_OSSL_TARGETS_freefunc_type(freefunc)) +#define sk_OSSL_TARGETS_insert(sk, ptr, idx) OPENSSL_sk_insert(ossl_check_OSSL_TARGETS_sk_type(sk), ossl_check_OSSL_TARGETS_type(ptr), (idx)) +#define sk_OSSL_TARGETS_set(sk, idx, ptr) ((OSSL_TARGETS *)OPENSSL_sk_set(ossl_check_OSSL_TARGETS_sk_type(sk), (idx), ossl_check_OSSL_TARGETS_type(ptr))) +#define sk_OSSL_TARGETS_find(sk, ptr) OPENSSL_sk_find(ossl_check_OSSL_TARGETS_sk_type(sk), ossl_check_OSSL_TARGETS_type(ptr)) +#define sk_OSSL_TARGETS_find_ex(sk, ptr) OPENSSL_sk_find_ex(ossl_check_OSSL_TARGETS_sk_type(sk), ossl_check_OSSL_TARGETS_type(ptr)) +#define sk_OSSL_TARGETS_find_all(sk, ptr, pnum) OPENSSL_sk_find_all(ossl_check_OSSL_TARGETS_sk_type(sk), ossl_check_OSSL_TARGETS_type(ptr), pnum) +#define sk_OSSL_TARGETS_sort(sk) OPENSSL_sk_sort(ossl_check_OSSL_TARGETS_sk_type(sk)) +#define sk_OSSL_TARGETS_is_sorted(sk) OPENSSL_sk_is_sorted(ossl_check_const_OSSL_TARGETS_sk_type(sk)) +#define sk_OSSL_TARGETS_dup(sk) ((STACK_OF(OSSL_TARGETS) *)OPENSSL_sk_dup(ossl_check_const_OSSL_TARGETS_sk_type(sk))) +#define sk_OSSL_TARGETS_deep_copy(sk, copyfunc, freefunc) ((STACK_OF(OSSL_TARGETS) *)OPENSSL_sk_deep_copy(ossl_check_const_OSSL_TARGETS_sk_type(sk), ossl_check_OSSL_TARGETS_copyfunc_type(copyfunc), ossl_check_OSSL_TARGETS_freefunc_type(freefunc))) +#define sk_OSSL_TARGETS_set_cmp_func(sk, cmp) ((sk_OSSL_TARGETS_compfunc)OPENSSL_sk_set_cmp_func(ossl_check_OSSL_TARGETS_sk_type(sk), ossl_check_OSSL_TARGETS_compfunc_type(cmp))) + + 
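x509_acert.h is new in this update and carries the RFC 5755 attribute-certificate API. A minimal sketch of decoding a DER-encoded attribute certificate and checking its signature, using only the declarations above; obtaining the issuer public key is assumed to happen elsewhere, and treating a return of 1 from X509_ACERT_verify() as success follows the usual OpenSSL convention rather than anything stated in this header.

#include <openssl/bio.h>
#include <openssl/x509_acert.h>

static int check_acert(const unsigned char *der, long der_len,
                       EVP_PKEY *issuer_key)
{
    BIO        *bio   = BIO_new_mem_buf(der, (int)der_len);
    X509_ACERT *acert = NULL;
    int         ok    = 0;

    if (bio == NULL)
        return 0;

    acert = d2i_X509_ACERT_bio(bio, NULL);
    if (acert != NULL
        && X509_ACERT_get_version(acert) == X509_ACERT_VERSION_2
        && X509_ACERT_verify(acert, issuer_key) == 1)   /* assumed: 1 == valid */
        ok = 1;

    X509_ACERT_free(acert);
    BIO_free(bio);
    return ok;
}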
+DECLARE_ASN1_FUNCTIONS(OSSL_TARGET) +DECLARE_ASN1_FUNCTIONS(OSSL_TARGETS) +DECLARE_ASN1_FUNCTIONS(OSSL_TARGETING_INFORMATION) + +#endif diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/include/openssl/x509_vfy.h b/CryptoPkg/Library/OpensslLib/OpensslGen/include/openssl/x509_vfy.h index 5156a1f320c8..68b20ee5f2d6 100644 --- a/CryptoPkg/Library/OpensslLib/OpensslGen/include/openssl/x509_vfy.h +++ b/CryptoPkg/Library/OpensslLib/OpensslGen/include/openssl/x509_vfy.h @@ -2,7 +2,7 @@ * WARNING: do not edit! * Generated by Makefile from include/openssl/x509_vfy.h.in * - * Copyright 1995-2021 The OpenSSL Project Authors. All Rights Reserved. + * Copyright 1995-2024 The OpenSSL Project Authors. All Rights Reserved. * * Licensed under the Apache License 2.0 (the "License"). You may not use * this file except in compliance with the License. You can obtain a copy @@ -411,6 +411,7 @@ X509_LOOKUP_ctrl_ex((x), X509_L_ADD_STORE, (name), 0, NULL, \ # define X509_V_ERR_CA_CERT_MISSING_KEY_USAGE 92 # define X509_V_ERR_EXTENSIONS_REQUIRE_VERSION_3 93 # define X509_V_ERR_EC_KEY_EXPLICIT_PARAMS 94 +# define X509_V_ERR_RPK_UNTRUSTED 95 /* Certificate verify flags */ # ifndef OPENSSL_NO_DEPRECATED_1_1_0 @@ -491,71 +492,72 @@ int X509_OBJECT_set1_X509(X509_OBJECT *a, X509 *obj); X509_CRL *X509_OBJECT_get0_X509_CRL(const X509_OBJECT *a); int X509_OBJECT_set1_X509_CRL(X509_OBJECT *a, X509_CRL *obj); X509_STORE *X509_STORE_new(void); -void X509_STORE_free(X509_STORE *v); -int X509_STORE_lock(X509_STORE *ctx); -int X509_STORE_unlock(X509_STORE *ctx); -int X509_STORE_up_ref(X509_STORE *v); -STACK_OF(X509_OBJECT) *X509_STORE_get0_objects(const X509_STORE *v); -STACK_OF(X509) *X509_STORE_get1_all_certs(X509_STORE *st); -STACK_OF(X509) *X509_STORE_CTX_get1_certs(X509_STORE_CTX *st, +void X509_STORE_free(X509_STORE *xs); +int X509_STORE_lock(X509_STORE *xs); +int X509_STORE_unlock(X509_STORE *xs); +int X509_STORE_up_ref(X509_STORE *xs); +STACK_OF(X509_OBJECT) *X509_STORE_get0_objects(const X509_STORE *xs); +STACK_OF(X509_OBJECT) *X509_STORE_get1_objects(X509_STORE *xs); +STACK_OF(X509) *X509_STORE_get1_all_certs(X509_STORE *xs); +STACK_OF(X509) *X509_STORE_CTX_get1_certs(X509_STORE_CTX *xs, const X509_NAME *nm); STACK_OF(X509_CRL) *X509_STORE_CTX_get1_crls(const X509_STORE_CTX *st, const X509_NAME *nm); -int X509_STORE_set_flags(X509_STORE *ctx, unsigned long flags); -int X509_STORE_set_purpose(X509_STORE *ctx, int purpose); -int X509_STORE_set_trust(X509_STORE *ctx, int trust); -int X509_STORE_set1_param(X509_STORE *ctx, const X509_VERIFY_PARAM *pm); -X509_VERIFY_PARAM *X509_STORE_get0_param(const X509_STORE *ctx); +int X509_STORE_set_flags(X509_STORE *xs, unsigned long flags); +int X509_STORE_set_purpose(X509_STORE *xs, int purpose); +int X509_STORE_set_trust(X509_STORE *xs, int trust); +int X509_STORE_set1_param(X509_STORE *xs, const X509_VERIFY_PARAM *pm); +X509_VERIFY_PARAM *X509_STORE_get0_param(const X509_STORE *xs); -void X509_STORE_set_verify(X509_STORE *ctx, X509_STORE_CTX_verify_fn verify); +void X509_STORE_set_verify(X509_STORE *xs, X509_STORE_CTX_verify_fn verify); #define X509_STORE_set_verify_func(ctx, func) \ X509_STORE_set_verify((ctx),(func)) void X509_STORE_CTX_set_verify(X509_STORE_CTX *ctx, X509_STORE_CTX_verify_fn verify); -X509_STORE_CTX_verify_fn X509_STORE_get_verify(const X509_STORE *ctx); -void X509_STORE_set_verify_cb(X509_STORE *ctx, +X509_STORE_CTX_verify_fn X509_STORE_get_verify(const X509_STORE *xs); +void X509_STORE_set_verify_cb(X509_STORE *xs, X509_STORE_CTX_verify_cb verify_cb); # 
define X509_STORE_set_verify_cb_func(ctx,func) \ X509_STORE_set_verify_cb((ctx),(func)) -X509_STORE_CTX_verify_cb X509_STORE_get_verify_cb(const X509_STORE *ctx); -void X509_STORE_set_get_issuer(X509_STORE *ctx, +X509_STORE_CTX_verify_cb X509_STORE_get_verify_cb(const X509_STORE *xs); +void X509_STORE_set_get_issuer(X509_STORE *xs, X509_STORE_CTX_get_issuer_fn get_issuer); -X509_STORE_CTX_get_issuer_fn X509_STORE_get_get_issuer(const X509_STORE *ctx); -void X509_STORE_set_check_issued(X509_STORE *ctx, +X509_STORE_CTX_get_issuer_fn X509_STORE_get_get_issuer(const X509_STORE *xs); +void X509_STORE_set_check_issued(X509_STORE *xs, X509_STORE_CTX_check_issued_fn check_issued); -X509_STORE_CTX_check_issued_fn X509_STORE_get_check_issued(const X509_STORE *ctx); -void X509_STORE_set_check_revocation(X509_STORE *ctx, +X509_STORE_CTX_check_issued_fn X509_STORE_get_check_issued(const X509_STORE *s); +void X509_STORE_set_check_revocation(X509_STORE *xs, X509_STORE_CTX_check_revocation_fn check_revocation); X509_STORE_CTX_check_revocation_fn - X509_STORE_get_check_revocation(const X509_STORE *ctx); -void X509_STORE_set_get_crl(X509_STORE *ctx, + X509_STORE_get_check_revocation(const X509_STORE *xs); +void X509_STORE_set_get_crl(X509_STORE *xs, X509_STORE_CTX_get_crl_fn get_crl); -X509_STORE_CTX_get_crl_fn X509_STORE_get_get_crl(const X509_STORE *ctx); -void X509_STORE_set_check_crl(X509_STORE *ctx, +X509_STORE_CTX_get_crl_fn X509_STORE_get_get_crl(const X509_STORE *xs); +void X509_STORE_set_check_crl(X509_STORE *xs, X509_STORE_CTX_check_crl_fn check_crl); -X509_STORE_CTX_check_crl_fn X509_STORE_get_check_crl(const X509_STORE *ctx); -void X509_STORE_set_cert_crl(X509_STORE *ctx, +X509_STORE_CTX_check_crl_fn X509_STORE_get_check_crl(const X509_STORE *xs); +void X509_STORE_set_cert_crl(X509_STORE *xs, X509_STORE_CTX_cert_crl_fn cert_crl); -X509_STORE_CTX_cert_crl_fn X509_STORE_get_cert_crl(const X509_STORE *ctx); -void X509_STORE_set_check_policy(X509_STORE *ctx, +X509_STORE_CTX_cert_crl_fn X509_STORE_get_cert_crl(const X509_STORE *xs); +void X509_STORE_set_check_policy(X509_STORE *xs, X509_STORE_CTX_check_policy_fn check_policy); -X509_STORE_CTX_check_policy_fn X509_STORE_get_check_policy(const X509_STORE *ctx); -void X509_STORE_set_lookup_certs(X509_STORE *ctx, +X509_STORE_CTX_check_policy_fn X509_STORE_get_check_policy(const X509_STORE *s); +void X509_STORE_set_lookup_certs(X509_STORE *xs, X509_STORE_CTX_lookup_certs_fn lookup_certs); -X509_STORE_CTX_lookup_certs_fn X509_STORE_get_lookup_certs(const X509_STORE *ctx); -void X509_STORE_set_lookup_crls(X509_STORE *ctx, +X509_STORE_CTX_lookup_certs_fn X509_STORE_get_lookup_certs(const X509_STORE *s); +void X509_STORE_set_lookup_crls(X509_STORE *xs, X509_STORE_CTX_lookup_crls_fn lookup_crls); #define X509_STORE_set_lookup_crls_cb(ctx, func) \ X509_STORE_set_lookup_crls((ctx), (func)) -X509_STORE_CTX_lookup_crls_fn X509_STORE_get_lookup_crls(const X509_STORE *ctx); -void X509_STORE_set_cleanup(X509_STORE *ctx, +X509_STORE_CTX_lookup_crls_fn X509_STORE_get_lookup_crls(const X509_STORE *xs); +void X509_STORE_set_cleanup(X509_STORE *xs, X509_STORE_CTX_cleanup_fn cleanup); -X509_STORE_CTX_cleanup_fn X509_STORE_get_cleanup(const X509_STORE *ctx); +X509_STORE_CTX_cleanup_fn X509_STORE_get_cleanup(const X509_STORE *xs); #define X509_STORE_get_ex_new_index(l, p, newf, dupf, freef) \ CRYPTO_get_ex_new_index(CRYPTO_EX_INDEX_X509_STORE, l, p, newf, dupf, freef) -int X509_STORE_set_ex_data(X509_STORE *ctx, int idx, void *data); -void *X509_STORE_get_ex_data(const 
X509_STORE *ctx, int idx); +int X509_STORE_set_ex_data(X509_STORE *xs, int idx, void *data); +void *X509_STORE_get_ex_data(const X509_STORE *xs, int idx); X509_STORE_CTX *X509_STORE_CTX_new_ex(OSSL_LIB_CTX *libctx, const char *propq); X509_STORE_CTX *X509_STORE_CTX_new(void); @@ -565,11 +567,14 @@ int X509_STORE_CTX_get1_issuer(X509 **issuer, X509_STORE_CTX *ctx, X509 *x); void X509_STORE_CTX_free(X509_STORE_CTX *ctx); int X509_STORE_CTX_init(X509_STORE_CTX *ctx, X509_STORE *trust_store, X509 *target, STACK_OF(X509) *untrusted); +int X509_STORE_CTX_init_rpk(X509_STORE_CTX *ctx, X509_STORE *trust_store, + EVP_PKEY* rpk); void X509_STORE_CTX_set0_trusted_stack(X509_STORE_CTX *ctx, STACK_OF(X509) *sk); void X509_STORE_CTX_cleanup(X509_STORE_CTX *ctx); X509_STORE *X509_STORE_CTX_get0_store(const X509_STORE_CTX *ctx); X509 *X509_STORE_CTX_get0_cert(const X509_STORE_CTX *ctx); +EVP_PKEY *X509_STORE_CTX_get0_rpk(const X509_STORE_CTX *ctx); STACK_OF(X509)* X509_STORE_CTX_get0_untrusted(const X509_STORE_CTX *ctx); void X509_STORE_CTX_set0_untrusted(X509_STORE_CTX *ctx, STACK_OF(X509) *sk); void X509_STORE_CTX_set_verify_cb(X509_STORE_CTX *ctx, @@ -579,6 +584,8 @@ X509_STORE_CTX_verify_fn X509_STORE_CTX_get_verify(const X509_STORE_CTX *ctx); X509_STORE_CTX_get_issuer_fn X509_STORE_CTX_get_get_issuer(const X509_STORE_CTX *ctx); X509_STORE_CTX_check_issued_fn X509_STORE_CTX_get_check_issued(const X509_STORE_CTX *ctx); X509_STORE_CTX_check_revocation_fn X509_STORE_CTX_get_check_revocation(const X509_STORE_CTX *ctx); +void X509_STORE_CTX_set_get_crl(X509_STORE_CTX *ctx, + X509_STORE_CTX_get_crl_fn get_crl); X509_STORE_CTX_get_crl_fn X509_STORE_CTX_get_get_crl(const X509_STORE_CTX *ctx); X509_STORE_CTX_check_crl_fn X509_STORE_CTX_get_check_crl(const X509_STORE_CTX *ctx); X509_STORE_CTX_cert_crl_fn X509_STORE_CTX_get_cert_crl(const X509_STORE_CTX *ctx); @@ -600,7 +607,7 @@ X509_STORE_CTX_cleanup_fn X509_STORE_CTX_get_cleanup(const X509_STORE_CTX *ctx); # define X509_STORE_get1_crl X509_STORE_CTX_get1_crls #endif -X509_LOOKUP *X509_STORE_add_lookup(X509_STORE *v, X509_LOOKUP_METHOD *m); +X509_LOOKUP *X509_STORE_add_lookup(X509_STORE *xs, X509_LOOKUP_METHOD *m); X509_LOOKUP_METHOD *X509_LOOKUP_hash_dir(void); X509_LOOKUP_METHOD *X509_LOOKUP_file(void); X509_LOOKUP_METHOD *X509_LOOKUP_store(void); @@ -685,8 +692,8 @@ X509_LOOKUP_get_by_alias_fn X509_LOOKUP_meth_get_get_by_alias( const X509_LOOKUP_METHOD *method); -int X509_STORE_add_cert(X509_STORE *ctx, X509 *x); -int X509_STORE_add_crl(X509_STORE *ctx, X509_CRL *x); +int X509_STORE_add_cert(X509_STORE *xs, X509 *x); +int X509_STORE_add_crl(X509_STORE *xs, X509_CRL *x); int X509_STORE_CTX_get_by_subject(const X509_STORE_CTX *vs, X509_LOOKUP_TYPE type, @@ -730,23 +737,21 @@ void *X509_LOOKUP_get_method_data(const X509_LOOKUP *ctx); X509_STORE *X509_LOOKUP_get_store(const X509_LOOKUP *ctx); int X509_LOOKUP_shutdown(X509_LOOKUP *ctx); -int X509_STORE_load_file(X509_STORE *ctx, const char *file); -int X509_STORE_load_path(X509_STORE *ctx, const char *path); -int X509_STORE_load_store(X509_STORE *ctx, const char *store); -int X509_STORE_load_locations(X509_STORE *ctx, - const char *file, - const char *dir); -int X509_STORE_set_default_paths(X509_STORE *ctx); +int X509_STORE_load_file(X509_STORE *xs, const char *file); +int X509_STORE_load_path(X509_STORE *xs, const char *path); +int X509_STORE_load_store(X509_STORE *xs, const char *store); +int X509_STORE_load_locations(X509_STORE *s, const char *file, const char *dir); +int 
X509_STORE_set_default_paths(X509_STORE *xs); -int X509_STORE_load_file_ex(X509_STORE *ctx, const char *file, +int X509_STORE_load_file_ex(X509_STORE *xs, const char *file, OSSL_LIB_CTX *libctx, const char *propq); -int X509_STORE_load_store_ex(X509_STORE *ctx, const char *store, +int X509_STORE_load_store_ex(X509_STORE *xs, const char *store, OSSL_LIB_CTX *libctx, const char *propq); -int X509_STORE_load_locations_ex(X509_STORE *ctx, const char *file, - const char *dir, OSSL_LIB_CTX *libctx, - const char *propq); -int X509_STORE_set_default_paths_ex(X509_STORE *ctx, OSSL_LIB_CTX *libctx, - const char *propq); +int X509_STORE_load_locations_ex(X509_STORE *xs, + const char *file, const char *dir, + OSSL_LIB_CTX *libctx, const char *propq); +int X509_STORE_set_default_paths_ex(X509_STORE *xs, + OSSL_LIB_CTX *libctx, const char *propq); #define X509_STORE_CTX_get_ex_new_index(l, p, newf, dupf, freef) \ CRYPTO_get_ex_new_index(CRYPTO_EX_INDEX_X509_STORE_CTX, l, p, newf, dupf, freef) @@ -764,6 +769,7 @@ X509_STORE_CTX *X509_STORE_CTX_get0_parent_ctx(const X509_STORE_CTX *ctx); STACK_OF(X509) *X509_STORE_CTX_get0_chain(const X509_STORE_CTX *ctx); STACK_OF(X509) *X509_STORE_CTX_get1_chain(const X509_STORE_CTX *ctx); void X509_STORE_CTX_set_cert(X509_STORE_CTX *ctx, X509 *target); +void X509_STORE_CTX_set0_rpk(X509_STORE_CTX *ctx, EVP_PKEY *target); void X509_STORE_CTX_set0_verified_chain(X509_STORE_CTX *c, STACK_OF(X509) *sk); void X509_STORE_CTX_set0_crls(X509_STORE_CTX *ctx, STACK_OF(X509_CRL) *sk); int X509_STORE_CTX_set_purpose(X509_STORE_CTX *ctx, int purpose); @@ -773,6 +779,8 @@ int X509_STORE_CTX_purpose_inherit(X509_STORE_CTX *ctx, int def_purpose, void X509_STORE_CTX_set_flags(X509_STORE_CTX *ctx, unsigned long flags); void X509_STORE_CTX_set_time(X509_STORE_CTX *ctx, unsigned long flags, time_t t); +void X509_STORE_CTX_set_current_reasons(X509_STORE_CTX *ctx, + unsigned int current_reasons); X509_POLICY_TREE *X509_STORE_CTX_get0_policy_tree(const X509_STORE_CTX *ctx); int X509_STORE_CTX_get_explicit_policy(const X509_STORE_CTX *ctx); diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/include/openssl/x509v3.h b/CryptoPkg/Library/OpensslLib/OpensslGen/include/openssl/x509v3.h index eb8501db90b8..5fd66fbda363 100644 --- a/CryptoPkg/Library/OpensslLib/OpensslGen/include/openssl/x509v3.h +++ b/CryptoPkg/Library/OpensslLib/OpensslGen/include/openssl/x509v3.h @@ -2,7 +2,7 @@ * WARNING: do not edit! * Generated by Makefile from include/openssl/x509v3.h.in * - * Copyright 1999-2023 The OpenSSL Project Authors. All Rights Reserved. + * Copyright 1999-2024 The OpenSSL Project Authors. All Rights Reserved. * * Licensed under the Apache License 2.0 (the "License"). You may not use * this file except in compliance with the License. 
You can obtain a copy @@ -25,6 +25,9 @@ # include # include # include +# ifndef OPENSSL_NO_STDIO +# include <stdio.h> +# endif #ifdef __cplusplus extern "C" { @@ -146,6 +149,11 @@ typedef struct BASIC_CONSTRAINTS_st { ASN1_INTEGER *pathlen; } BASIC_CONSTRAINTS; +typedef struct OSSL_BASIC_ATTR_CONSTRAINTS_st { + int authority; + ASN1_INTEGER *pathlen; +} OSSL_BASIC_ATTR_CONSTRAINTS; + typedef struct PKEY_USAGE_PERIOD_st { ASN1_GENERALIZEDTIME *notBefore; ASN1_GENERALIZEDTIME *notAfter; @@ -198,6 +206,8 @@ typedef struct ACCESS_DESCRIPTION_st { GENERAL_NAME *location; } ACCESS_DESCRIPTION; +int GENERAL_NAME_set1_X509_NAME(GENERAL_NAME **tgt, const X509_NAME *src); + SKM_DEFINE_STACK_OF_INTERNAL(ACCESS_DESCRIPTION, ACCESS_DESCRIPTION, ACCESS_DESCRIPTION) #define sk_ACCESS_DESCRIPTION_num(sk) OPENSSL_sk_num(ossl_check_const_ACCESS_DESCRIPTION_sk_type(sk)) #define sk_ACCESS_DESCRIPTION_value(sk, idx) ((ACCESS_DESCRIPTION *)OPENSSL_sk_value(ossl_check_const_ACCESS_DESCRIPTION_sk_type(sk), (idx))) @@ -294,6 +304,7 @@ typedef struct DIST_POINT_NAME_st { /* If relativename then this contains the full distribution point name */ X509_NAME *dpname; } DIST_POINT_NAME; +DECLARE_ASN1_DUP_FUNCTION(DIST_POINT_NAME) /* All existing reasons */ # define CRLDP_ALL_REASONS 0x807f @@ -659,15 +670,16 @@ struct ISSUING_DIST_POINT_st { # define EXFLAG_SAN_CRITICAL 0x80000 # define EXFLAG_NO_FINGERPRINT 0x100000 -# define KU_DIGITAL_SIGNATURE 0x0080 -# define KU_NON_REPUDIATION 0x0040 -# define KU_KEY_ENCIPHERMENT 0x0020 -# define KU_DATA_ENCIPHERMENT 0x0010 -# define KU_KEY_AGREEMENT 0x0008 -# define KU_KEY_CERT_SIGN 0x0004 -# define KU_CRL_SIGN 0x0002 -# define KU_ENCIPHER_ONLY 0x0001 -# define KU_DECIPHER_ONLY 0x8000 +/* https://datatracker.ietf.org/doc/html/rfc5280#section-4.2.1.3 */ +# define KU_DIGITAL_SIGNATURE X509v3_KU_DIGITAL_SIGNATURE +# define KU_NON_REPUDIATION X509v3_KU_NON_REPUDIATION +# define KU_KEY_ENCIPHERMENT X509v3_KU_KEY_ENCIPHERMENT +# define KU_DATA_ENCIPHERMENT X509v3_KU_DATA_ENCIPHERMENT +# define KU_KEY_AGREEMENT X509v3_KU_KEY_AGREEMENT +# define KU_KEY_CERT_SIGN X509v3_KU_KEY_CERT_SIGN +# define KU_CRL_SIGN X509v3_KU_CRL_SIGN +# define KU_ENCIPHER_ONLY X509v3_KU_ENCIPHER_ONLY +# define KU_DECIPHER_ONLY X509v3_KU_DECIPHER_ONLY # define NS_SSL_CLIENT 0x80 # define NS_SSL_SERVER 0x40 @@ -739,9 +751,10 @@ SKM_DEFINE_STACK_OF_INTERNAL(X509_PURPOSE, X509_PURPOSE, X509_PURPOSE) # define X509_PURPOSE_ANY 7 # define X509_PURPOSE_OCSP_HELPER 8 # define X509_PURPOSE_TIMESTAMP_SIGN 9 +# define X509_PURPOSE_CODE_SIGN 10 # define X509_PURPOSE_MIN 1 -# define X509_PURPOSE_MAX 9 +# define X509_PURPOSE_MAX 10 /* Flags for X509V3_EXT_print() */ @@ -767,6 +780,7 @@ SKM_DEFINE_STACK_OF_INTERNAL(X509_PURPOSE, X509_PURPOSE, X509_PURPOSE) # define X509V3_ADD_SILENT 0x10 DECLARE_ASN1_FUNCTIONS(BASIC_CONSTRAINTS) +DECLARE_ASN1_FUNCTIONS(OSSL_BASIC_ATTR_CONSTRAINTS) DECLARE_ASN1_FUNCTIONS(SXNET) DECLARE_ASN1_FUNCTIONS(SXNETID) @@ -1444,6 +1458,42 @@ const ASN1_PRINTABLESTRING *PROFESSION_INFO_get0_registrationNumber( void PROFESSION_INFO_set0_registrationNumber( PROFESSION_INFO *pi, ASN1_PRINTABLESTRING *rn); +int OSSL_GENERAL_NAMES_print(BIO *out, GENERAL_NAMES *gens, int indent); + +typedef STACK_OF(X509_ATTRIBUTE) OSSL_ATTRIBUTES_SYNTAX; +DECLARE_ASN1_FUNCTIONS(OSSL_ATTRIBUTES_SYNTAX) + +typedef STACK_OF(USERNOTICE) OSSL_USER_NOTICE_SYNTAX; +DECLARE_ASN1_FUNCTIONS(OSSL_USER_NOTICE_SYNTAX) + +SKM_DEFINE_STACK_OF_INTERNAL(USERNOTICE, USERNOTICE, USERNOTICE) +#define sk_USERNOTICE_num(sk)
OPENSSL_sk_num(ossl_check_const_USERNOTICE_sk_type(sk)) +#define sk_USERNOTICE_value(sk, idx) ((USERNOTICE *)OPENSSL_sk_value(ossl_check_const_USERNOTICE_sk_type(sk), (idx))) +#define sk_USERNOTICE_new(cmp) ((STACK_OF(USERNOTICE) *)OPENSSL_sk_new(ossl_check_USERNOTICE_compfunc_type(cmp))) +#define sk_USERNOTICE_new_null() ((STACK_OF(USERNOTICE) *)OPENSSL_sk_new_null()) +#define sk_USERNOTICE_new_reserve(cmp, n) ((STACK_OF(USERNOTICE) *)OPENSSL_sk_new_reserve(ossl_check_USERNOTICE_compfunc_type(cmp), (n))) +#define sk_USERNOTICE_reserve(sk, n) OPENSSL_sk_reserve(ossl_check_USERNOTICE_sk_type(sk), (n)) +#define sk_USERNOTICE_free(sk) OPENSSL_sk_free(ossl_check_USERNOTICE_sk_type(sk)) +#define sk_USERNOTICE_zero(sk) OPENSSL_sk_zero(ossl_check_USERNOTICE_sk_type(sk)) +#define sk_USERNOTICE_delete(sk, i) ((USERNOTICE *)OPENSSL_sk_delete(ossl_check_USERNOTICE_sk_type(sk), (i))) +#define sk_USERNOTICE_delete_ptr(sk, ptr) ((USERNOTICE *)OPENSSL_sk_delete_ptr(ossl_check_USERNOTICE_sk_type(sk), ossl_check_USERNOTICE_type(ptr))) +#define sk_USERNOTICE_push(sk, ptr) OPENSSL_sk_push(ossl_check_USERNOTICE_sk_type(sk), ossl_check_USERNOTICE_type(ptr)) +#define sk_USERNOTICE_unshift(sk, ptr) OPENSSL_sk_unshift(ossl_check_USERNOTICE_sk_type(sk), ossl_check_USERNOTICE_type(ptr)) +#define sk_USERNOTICE_pop(sk) ((USERNOTICE *)OPENSSL_sk_pop(ossl_check_USERNOTICE_sk_type(sk))) +#define sk_USERNOTICE_shift(sk) ((USERNOTICE *)OPENSSL_sk_shift(ossl_check_USERNOTICE_sk_type(sk))) +#define sk_USERNOTICE_pop_free(sk, freefunc) OPENSSL_sk_pop_free(ossl_check_USERNOTICE_sk_type(sk),ossl_check_USERNOTICE_freefunc_type(freefunc)) +#define sk_USERNOTICE_insert(sk, ptr, idx) OPENSSL_sk_insert(ossl_check_USERNOTICE_sk_type(sk), ossl_check_USERNOTICE_type(ptr), (idx)) +#define sk_USERNOTICE_set(sk, idx, ptr) ((USERNOTICE *)OPENSSL_sk_set(ossl_check_USERNOTICE_sk_type(sk), (idx), ossl_check_USERNOTICE_type(ptr))) +#define sk_USERNOTICE_find(sk, ptr) OPENSSL_sk_find(ossl_check_USERNOTICE_sk_type(sk), ossl_check_USERNOTICE_type(ptr)) +#define sk_USERNOTICE_find_ex(sk, ptr) OPENSSL_sk_find_ex(ossl_check_USERNOTICE_sk_type(sk), ossl_check_USERNOTICE_type(ptr)) +#define sk_USERNOTICE_find_all(sk, ptr, pnum) OPENSSL_sk_find_all(ossl_check_USERNOTICE_sk_type(sk), ossl_check_USERNOTICE_type(ptr), pnum) +#define sk_USERNOTICE_sort(sk) OPENSSL_sk_sort(ossl_check_USERNOTICE_sk_type(sk)) +#define sk_USERNOTICE_is_sorted(sk) OPENSSL_sk_is_sorted(ossl_check_const_USERNOTICE_sk_type(sk)) +#define sk_USERNOTICE_dup(sk) ((STACK_OF(USERNOTICE) *)OPENSSL_sk_dup(ossl_check_const_USERNOTICE_sk_type(sk))) +#define sk_USERNOTICE_deep_copy(sk, copyfunc, freefunc) ((STACK_OF(USERNOTICE) *)OPENSSL_sk_deep_copy(ossl_check_const_USERNOTICE_sk_type(sk), ossl_check_USERNOTICE_copyfunc_type(copyfunc), ossl_check_USERNOTICE_freefunc_type(freefunc))) +#define sk_USERNOTICE_set_cmp_func(sk, cmp) ((sk_USERNOTICE_compfunc)OPENSSL_sk_set_cmp_func(ossl_check_USERNOTICE_sk_type(sk), ossl_check_USERNOTICE_compfunc_type(cmp))) + + # ifdef __cplusplus } # endif diff --git a/CryptoPkg/Library/OpensslLib/OpensslLib.inf b/CryptoPkg/Library/OpensslLib/OpensslLib.inf index ebdeae18c74c..63fce8d41f39 100644 --- a/CryptoPkg/Library/OpensslLib/OpensslLib.inf +++ b/CryptoPkg/Library/OpensslLib/OpensslLib.inf @@ -128,6 +128,7 @@ $(OPENSSL_PATH)/crypto/bio/bss_conn.c $(OPENSSL_PATH)/crypto/bio/bss_core.c $(OPENSSL_PATH)/crypto/bio/bss_dgram.c + $(OPENSSL_PATH)/crypto/bio/bss_dgram_pair.c $(OPENSSL_PATH)/crypto/bio/bss_fd.c $(OPENSSL_PATH)/crypto/bio/bss_file.c 
$(OPENSSL_PATH)/crypto/bio/bss_log.c @@ -169,7 +170,9 @@ $(OPENSSL_PATH)/crypto/bn/bn_x931p.c $(OPENSSL_PATH)/crypto/buffer/buf_err.c $(OPENSSL_PATH)/crypto/buffer/buffer.c + $(OPENSSL_PATH)/crypto/comp/c_brotli.c $(OPENSSL_PATH)/crypto/comp/c_zlib.c + $(OPENSSL_PATH)/crypto/comp/c_zstd.c $(OPENSSL_PATH)/crypto/comp/comp_err.c $(OPENSSL_PATH)/crypto/comp/comp_lib.c $(OPENSSL_PATH)/crypto/conf/conf_api.c @@ -209,7 +212,9 @@ $(OPENSSL_PATH)/crypto/err/err_all.c $(OPENSSL_PATH)/crypto/err/err_all_legacy.c $(OPENSSL_PATH)/crypto/err/err_blocks.c + $(OPENSSL_PATH)/crypto/err/err_mark.c $(OPENSSL_PATH)/crypto/err/err_prn.c + $(OPENSSL_PATH)/crypto/err/err_save.c $(OPENSSL_PATH)/crypto/ess/ess_asn1.c $(OPENSSL_PATH)/crypto/ess/ess_err.c $(OPENSSL_PATH)/crypto/ess/ess_lib.c @@ -292,7 +297,10 @@ $(OPENSSL_PATH)/crypto/ffc/ffc_params.c $(OPENSSL_PATH)/crypto/ffc/ffc_params_generate.c $(OPENSSL_PATH)/crypto/ffc/ffc_params_validate.c + $(OPENSSL_PATH)/crypto/hashtable/hashtable.c $(OPENSSL_PATH)/crypto/hmac/hmac.c + $(OPENSSL_PATH)/crypto/hpke/hpke.c + $(OPENSSL_PATH)/crypto/hpke/hpke_util.c $(OPENSSL_PATH)/crypto/http/http_client.c $(OPENSSL_PATH)/crypto/http/http_err.c $(OPENSSL_PATH)/crypto/http/http_lib.c @@ -301,6 +309,7 @@ $(OPENSSL_PATH)/crypto/lhash/lhash.c $(OPENSSL_PATH)/crypto/asn1_dsa.c $(OPENSSL_PATH)/crypto/bsearch.c + $(OPENSSL_PATH)/crypto/comp_methods.c $(OPENSSL_PATH)/crypto/context.c $(OPENSSL_PATH)/crypto/core_algorithm.c $(OPENSSL_PATH)/crypto/core_fetch.c @@ -310,10 +319,13 @@ $(OPENSSL_PATH)/crypto/cryptlib.c $(OPENSSL_PATH)/crypto/ctype.c $(OPENSSL_PATH)/crypto/cversion.c + $(OPENSSL_PATH)/crypto/defaults.c $(OPENSSL_PATH)/crypto/der_writer.c + $(OPENSSL_PATH)/crypto/deterministic_nonce.c $(OPENSSL_PATH)/crypto/ebcdic.c $(OPENSSL_PATH)/crypto/ex_data.c $(OPENSSL_PATH)/crypto/getenv.c + $(OPENSSL_PATH)/crypto/indicator_core.c $(OPENSSL_PATH)/crypto/info.c $(OPENSSL_PATH)/crypto/init.c $(OPENSSL_PATH)/crypto/initthread.c @@ -337,12 +349,15 @@ $(OPENSSL_PATH)/crypto/provider_conf.c $(OPENSSL_PATH)/crypto/provider_core.c $(OPENSSL_PATH)/crypto/punycode.c + $(OPENSSL_PATH)/crypto/quic_vlint.c $(OPENSSL_PATH)/crypto/self_test_core.c + $(OPENSSL_PATH)/crypto/sleep.c $(OPENSSL_PATH)/crypto/sparse_array.c $(OPENSSL_PATH)/crypto/threads_lib.c $(OPENSSL_PATH)/crypto/threads_none.c $(OPENSSL_PATH)/crypto/threads_pthread.c $(OPENSSL_PATH)/crypto/threads_win.c + $(OPENSSL_PATH)/crypto/time.c $(OPENSSL_PATH)/crypto/trace.c $(OPENSSL_PATH)/crypto/uid.c $(OPENSSL_PATH)/crypto/md5/md5_dgst.c @@ -359,6 +374,7 @@ $(OPENSSL_PATH)/crypto/modes/siv128.c $(OPENSSL_PATH)/crypto/modes/wrap128.c $(OPENSSL_PATH)/crypto/modes/xts128.c + $(OPENSSL_PATH)/crypto/modes/xts128gb.c $(OPENSSL_PATH)/crypto/objects/o_names.c $(OPENSSL_PATH)/crypto/objects/obj_dat.c $(OPENSSL_PATH)/crypto/objects/obj_err.c @@ -395,6 +411,7 @@ $(OPENSSL_PATH)/crypto/rand/rand_lib.c $(OPENSSL_PATH)/crypto/rand/rand_meth.c $(OPENSSL_PATH)/crypto/rand/rand_pool.c + $(OPENSSL_PATH)/crypto/rand/rand_uniform.c $(OPENSSL_PATH)/crypto/rsa/rsa_ameth.c $(OPENSSL_PATH)/crypto/rsa/rsa_asn1.c $(OPENSSL_PATH)/crypto/rsa/rsa_backend.c @@ -429,6 +446,8 @@ $(OPENSSL_PATH)/crypto/sm3/legacy_sm3.c $(OPENSSL_PATH)/crypto/sm3/sm3.c $(OPENSSL_PATH)/crypto/stack/stack.c + $(OPENSSL_PATH)/crypto/thread/arch/thread_win.c + $(OPENSSL_PATH)/crypto/thread/api.c $(OPENSSL_PATH)/crypto/txt_db/txt_db.c $(OPENSSL_PATH)/crypto/ui/ui_err.c $(OPENSSL_PATH)/crypto/ui/ui_lib.c @@ -444,14 +463,18 @@ $(OPENSSL_PATH)/crypto/x509/pcy_map.c 
$(OPENSSL_PATH)/crypto/x509/pcy_node.c $(OPENSSL_PATH)/crypto/x509/pcy_tree.c + $(OPENSSL_PATH)/crypto/x509/t_acert.c $(OPENSSL_PATH)/crypto/x509/t_crl.c $(OPENSSL_PATH)/crypto/x509/t_req.c $(OPENSSL_PATH)/crypto/x509/t_x509.c + $(OPENSSL_PATH)/crypto/x509/v3_ac_tgt.c $(OPENSSL_PATH)/crypto/x509/v3_addr.c $(OPENSSL_PATH)/crypto/x509/v3_admis.c $(OPENSSL_PATH)/crypto/x509/v3_akeya.c $(OPENSSL_PATH)/crypto/x509/v3_akid.c $(OPENSSL_PATH)/crypto/x509/v3_asid.c + $(OPENSSL_PATH)/crypto/x509/v3_audit_id.c + $(OPENSSL_PATH)/crypto/x509/v3_battcons.c $(OPENSSL_PATH)/crypto/x509/v3_bcons.c $(OPENSSL_PATH)/crypto/x509/v3_bitst.c $(OPENSSL_PATH)/crypto/x509/v3_conf.c @@ -460,12 +483,17 @@ $(OPENSSL_PATH)/crypto/x509/v3_enum.c $(OPENSSL_PATH)/crypto/x509/v3_extku.c $(OPENSSL_PATH)/crypto/x509/v3_genn.c + $(OPENSSL_PATH)/crypto/x509/v3_group_ac.c $(OPENSSL_PATH)/crypto/x509/v3_ia5.c + $(OPENSSL_PATH)/crypto/x509/v3_ind_iss.c $(OPENSSL_PATH)/crypto/x509/v3_info.c $(OPENSSL_PATH)/crypto/x509/v3_int.c + $(OPENSSL_PATH)/crypto/x509/v3_iobo.c $(OPENSSL_PATH)/crypto/x509/v3_ist.c $(OPENSSL_PATH)/crypto/x509/v3_lib.c $(OPENSSL_PATH)/crypto/x509/v3_ncons.c + $(OPENSSL_PATH)/crypto/x509/v3_no_ass.c + $(OPENSSL_PATH)/crypto/x509/v3_no_rev_avail.c $(OPENSSL_PATH)/crypto/x509/v3_pci.c $(OPENSSL_PATH)/crypto/x509/v3_pcia.c $(OPENSSL_PATH)/crypto/x509/v3_pcons.c @@ -474,12 +502,17 @@ $(OPENSSL_PATH)/crypto/x509/v3_prn.c $(OPENSSL_PATH)/crypto/x509/v3_purp.c $(OPENSSL_PATH)/crypto/x509/v3_san.c + $(OPENSSL_PATH)/crypto/x509/v3_sda.c + $(OPENSSL_PATH)/crypto/x509/v3_single_use.c $(OPENSSL_PATH)/crypto/x509/v3_skid.c + $(OPENSSL_PATH)/crypto/x509/v3_soa_id.c $(OPENSSL_PATH)/crypto/x509/v3_sxnet.c $(OPENSSL_PATH)/crypto/x509/v3_tlsf.c + $(OPENSSL_PATH)/crypto/x509/v3_usernotice.c $(OPENSSL_PATH)/crypto/x509/v3_utf8.c $(OPENSSL_PATH)/crypto/x509/v3_utl.c $(OPENSSL_PATH)/crypto/x509/v3err.c + $(OPENSSL_PATH)/crypto/x509/x509_acert.c $(OPENSSL_PATH)/crypto/x509/x509_att.c $(OPENSSL_PATH)/crypto/x509/x509_cmp.c $(OPENSSL_PATH)/crypto/x509/x509_d2.c @@ -497,6 +530,7 @@ $(OPENSSL_PATH)/crypto/x509/x509_v3.c $(OPENSSL_PATH)/crypto/x509/x509_vfy.c $(OPENSSL_PATH)/crypto/x509/x509_vpm.c + $(OPENSSL_PATH)/crypto/x509/x509aset.c $(OPENSSL_PATH)/crypto/x509/x509cset.c $(OPENSSL_PATH)/crypto/x509/x509name.c $(OPENSSL_PATH)/crypto/x509/x509rset.c @@ -506,6 +540,7 @@ $(OPENSSL_PATH)/crypto/x509/x_attrib.c $(OPENSSL_PATH)/crypto/x509/x_crl.c $(OPENSSL_PATH)/crypto/x509/x_exten.c + $(OPENSSL_PATH)/crypto/x509/x_ietfatt.c $(OPENSSL_PATH)/crypto/x509/x_name.c $(OPENSSL_PATH)/crypto/x509/x_pubkey.c $(OPENSSL_PATH)/crypto/x509/x_req.c @@ -552,7 +587,9 @@ $(OPENSSL_PATH)/providers/implementations/encode_decode/endecoder_common.c $(OPENSSL_PATH)/providers/implementations/exchange/dh_exch.c $(OPENSSL_PATH)/providers/implementations/exchange/kdf_exch.c + $(OPENSSL_PATH)/providers/implementations/kdfs/argon2.c $(OPENSSL_PATH)/providers/implementations/kdfs/hkdf.c + $(OPENSSL_PATH)/providers/implementations/kdfs/hmacdrbg_kdf.c $(OPENSSL_PATH)/providers/implementations/kdfs/kbkdf.c $(OPENSSL_PATH)/providers/implementations/kdfs/krb5kdf.c $(OPENSSL_PATH)/providers/implementations/kdfs/pbkdf2.c @@ -571,12 +608,12 @@ $(OPENSSL_PATH)/providers/implementations/macs/gmac_prov.c $(OPENSSL_PATH)/providers/implementations/macs/hmac_prov.c $(OPENSSL_PATH)/providers/implementations/macs/kmac_prov.c - $(OPENSSL_PATH)/providers/implementations/rands/crngt.c $(OPENSSL_PATH)/providers/implementations/rands/drbg.c 
$(OPENSSL_PATH)/providers/implementations/rands/drbg_ctr.c $(OPENSSL_PATH)/providers/implementations/rands/drbg_hash.c $(OPENSSL_PATH)/providers/implementations/rands/drbg_hmac.c $(OPENSSL_PATH)/providers/implementations/rands/seed_src.c + $(OPENSSL_PATH)/providers/implementations/rands/seed_src_jitter.c $(OPENSSL_PATH)/providers/implementations/rands/test_rng.c $(OPENSSL_PATH)/providers/implementations/rands/seeding/rand_cpu_x86.c $(OPENSSL_PATH)/providers/implementations/rands/seeding/rand_tsc.c @@ -584,7 +621,8 @@ $(OPENSSL_PATH)/providers/implementations/rands/seeding/rand_win.c $(OPENSSL_PATH)/providers/implementations/signature/mac_legacy_sig.c $(OPENSSL_PATH)/providers/implementations/signature/rsa_sig.c - $(OPENSSL_PATH)/ssl/s3_cbc.c + $(OPENSSL_PATH)/ssl/record/methods/ssl3_cbc.c + $(OPENSSL_GEN_PATH)/crypto/params_idx.c $(OPENSSL_PATH)/providers/common/der/der_rsa_key.c $(OPENSSL_PATH)/providers/common/provider_ctx.c $(OPENSSL_PATH)/providers/common/provider_err.c @@ -596,7 +634,7 @@ $(OPENSSL_PATH)/providers/implementations/ciphers/ciphercommon_gcm_hw.c $(OPENSSL_PATH)/providers/implementations/ciphers/ciphercommon_hw.c $(OPENSSL_PATH)/providers/implementations/digests/digestcommon.c - $(OPENSSL_PATH)/ssl/record/tls_pad.c + $(OPENSSL_PATH)/ssl/record/methods/tls_pad.c $(OPENSSL_GEN_PATH)/providers/common/der/der_digests_gen.c $(OPENSSL_GEN_PATH)/providers/common/der/der_rsa_gen.c $(OPENSSL_GEN_PATH)/providers/common/der/der_wrap_gen.c @@ -611,6 +649,7 @@ $(OPENSSL_PATH)/ssl/s3_msg.c $(OPENSSL_PATH)/ssl/ssl_asn1.c $(OPENSSL_PATH)/ssl/ssl_cert.c + $(OPENSSL_PATH)/ssl/ssl_cert_comp.c $(OPENSSL_PATH)/ssl/ssl_ciph.c $(OPENSSL_PATH)/ssl/ssl_conf.c $(OPENSSL_PATH)/ssl/ssl_err.c @@ -630,12 +669,16 @@ $(OPENSSL_PATH)/ssl/tls13_enc.c $(OPENSSL_PATH)/ssl/tls_depr.c $(OPENSSL_PATH)/ssl/tls_srp.c - $(OPENSSL_PATH)/ssl/record/dtls1_bitmap.c $(OPENSSL_PATH)/ssl/record/rec_layer_d1.c $(OPENSSL_PATH)/ssl/record/rec_layer_s3.c - $(OPENSSL_PATH)/ssl/record/ssl3_buffer.c - $(OPENSSL_PATH)/ssl/record/ssl3_record.c - $(OPENSSL_PATH)/ssl/record/ssl3_record_tls13.c + $(OPENSSL_PATH)/ssl/record/methods/dtls_meth.c + $(OPENSSL_PATH)/ssl/record/methods/ssl3_meth.c + $(OPENSSL_PATH)/ssl/record/methods/tls13_meth.c + $(OPENSSL_PATH)/ssl/record/methods/tls1_meth.c + $(OPENSSL_PATH)/ssl/record/methods/tls_common.c + $(OPENSSL_PATH)/ssl/record/methods/tls_multib.c + $(OPENSSL_PATH)/ssl/record/methods/tlsany_meth.c + $(OPENSSL_PATH)/ssl/rio/poll_immediate.c $(OPENSSL_PATH)/ssl/statem/extensions.c $(OPENSSL_PATH)/ssl/statem/extensions_clnt.c $(OPENSSL_PATH)/ssl/statem/extensions_cust.c @@ -687,16 +730,16 @@ # C4819: The file contains a character that cannot be represented in the current code page # C4133: incompatible types - from 'ASN1_TYPE *' to 'const ASN1_STRING *' (v3_genn.c(101)) # - MSFT:*_*_IA32_CC_FLAGS = -U_WIN32 -U_WIN64 -U_MSC_VER $(OPENSSL_FLAGS) $(OPENSSL_FLAGS_NOASM) /wd4090 /wd4132 /wd4210 /wd4244 /wd4245 /wd4267 /wd4310 /wd4389 /wd4700 /wd4702 /wd4706 /wd4819 /wd4133 - MSFT:*_*_X64_CC_FLAGS = -U_WIN32 -U_WIN64 -U_MSC_VER $(OPENSSL_FLAGS) $(OPENSSL_FLAGS_NOASM) /wd4090 /wd4132 /wd4210 /wd4244 /wd4245 /wd4267 /wd4306 /wd4310 /wd4700 /wd4389 /wd4702 /wd4706 /wd4819 /wd4133 + MSFT:*_*_IA32_CC_FLAGS = -U_WIN32 -U_WIN64 -U_MSC_VER $(OPENSSL_FLAGS) $(OPENSSL_FLAGS_NOASM) /wd4090 /wd4132 /wd4210 /wd4244 /wd4245 /wd4267 /wd4310 /wd4389 /wd4700 /wd4702 /wd4706 /wd4819 /wd4133 /wd4701 /wd4703 /wd4189 + MSFT:*_*_X64_CC_FLAGS = -U_WIN32 -U_WIN64 -U_MSC_VER $(OPENSSL_FLAGS) $(OPENSSL_FLAGS_NOASM) /wd4090 
/wd4132 /wd4210 /wd4244 /wd4245 /wd4267 /wd4306 /wd4310 /wd4700 /wd4389 /wd4702 /wd4706 /wd4819 /wd4133 /wd4701 /wd4703 /wd4189 # # Disable following Visual Studio 2015 compiler warnings brought by openssl source, # so we do not break the build with /WX option: # C4718: recursive call has no side effects, deleting # - MSFT:*_VS2015x86_IA32_CC_FLAGS = /wd4718 - MSFT:*_VS2015x86_X64_CC_FLAGS = /wd4718 + MSFT:*_VS2015x86_IA32_CC_FLAGS = /wd4718 /wd4701 /wd4703 /wd4189 + MSFT:*_VS2015x86_X64_CC_FLAGS = /wd4718 /wd4701 /wd4703 /wd4189 INTEL:*_*_IA32_CC_FLAGS = -U_WIN32 -U_WIN64 -U_MSC_VER -U__ICC $(OPENSSL_FLAGS) $(OPENSSL_FLAGS_NOASM) /w INTEL:*_*_X64_CC_FLAGS = -U_WIN32 -U_WIN64 -U_MSC_VER -U__ICC $(OPENSSL_FLAGS) $(OPENSSL_FLAGS_NOASM) /w diff --git a/CryptoPkg/Library/OpensslLib/OpensslLibAccel.inf b/CryptoPkg/Library/OpensslLib/OpensslLibAccel.inf index 7ef206ec1695..901fdf5f19ba 100644 --- a/CryptoPkg/Library/OpensslLib/OpensslLibAccel.inf +++ b/CryptoPkg/Library/OpensslLib/OpensslLibAccel.inf @@ -25,7 +25,7 @@ DEFINE OPENSSL_FLAGS = -DL_ENDIAN -DOPENSSL_SMALL_FOOTPRINT -D_CRT_SECURE_NO_DEPRECATE -D_CRT_NONSTDC_NO_DEPRECATE -DEDK2_OPENSSL_NOEC=1 DEFINE OPENSSL_FLAGS_IA32 = -DAES_ASM -DGHASH_ASM -DMD5_ASM -DOPENSSL_CPUID_OBJ -DSHA1_ASM -DSHA256_ASM -DSHA512_ASM -DVPAES_ASM DEFINE OPENSSL_FLAGS_X64 = -DAES_ASM -DBSAES_ASM -DGHASH_ASM -DKECCAK1600_ASM -DMD5_ASM -DOPENSSL_CPUID_OBJ -DSHA1_ASM -DSHA256_ASM -DSHA512_ASM -DVPAES_ASM - DEFINE OPENSSL_FLAGS_AARCH64 = -DKECCAK1600_ASM -DOPENSSL_CPUID_OBJ -DSHA1_ASM -DSHA256_ASM -DSHA512_ASM -DVPAES_ASM + DEFINE OPENSSL_FLAGS_AARCH64 = -DBSAES_ASM -DKECCAK1600_ASM -DMD5_ASM -DOPENSSL_CPUID_OBJ -DOPENSSL_SM3_ASM -DSHA1_ASM -DSHA256_ASM -DSHA512_ASM -DVPAES_ASM # # VALID_ARCHITECTURES = IA32 X64 AARCH64 @@ -146,6 +146,7 @@ $(OPENSSL_PATH)/crypto/bio/bss_conn.c $(OPENSSL_PATH)/crypto/bio/bss_core.c $(OPENSSL_PATH)/crypto/bio/bss_dgram.c + $(OPENSSL_PATH)/crypto/bio/bss_dgram_pair.c $(OPENSSL_PATH)/crypto/bio/bss_fd.c $(OPENSSL_PATH)/crypto/bio/bss_file.c $(OPENSSL_PATH)/crypto/bio/bss_log.c @@ -186,7 +187,9 @@ $(OPENSSL_PATH)/crypto/bn/bn_x931p.c $(OPENSSL_PATH)/crypto/buffer/buf_err.c $(OPENSSL_PATH)/crypto/buffer/buffer.c + $(OPENSSL_PATH)/crypto/comp/c_brotli.c $(OPENSSL_PATH)/crypto/comp/c_zlib.c + $(OPENSSL_PATH)/crypto/comp/c_zstd.c $(OPENSSL_PATH)/crypto/comp/comp_err.c $(OPENSSL_PATH)/crypto/comp/comp_lib.c $(OPENSSL_PATH)/crypto/conf/conf_api.c @@ -226,7 +229,9 @@ $(OPENSSL_PATH)/crypto/err/err_all.c $(OPENSSL_PATH)/crypto/err/err_all_legacy.c $(OPENSSL_PATH)/crypto/err/err_blocks.c + $(OPENSSL_PATH)/crypto/err/err_mark.c $(OPENSSL_PATH)/crypto/err/err_prn.c + $(OPENSSL_PATH)/crypto/err/err_save.c $(OPENSSL_PATH)/crypto/ess/ess_asn1.c $(OPENSSL_PATH)/crypto/ess/ess_err.c $(OPENSSL_PATH)/crypto/ess/ess_lib.c @@ -309,7 +314,10 @@ $(OPENSSL_PATH)/crypto/ffc/ffc_params.c $(OPENSSL_PATH)/crypto/ffc/ffc_params_generate.c $(OPENSSL_PATH)/crypto/ffc/ffc_params_validate.c + $(OPENSSL_PATH)/crypto/hashtable/hashtable.c $(OPENSSL_PATH)/crypto/hmac/hmac.c + $(OPENSSL_PATH)/crypto/hpke/hpke.c + $(OPENSSL_PATH)/crypto/hpke/hpke_util.c $(OPENSSL_PATH)/crypto/http/http_client.c $(OPENSSL_PATH)/crypto/http/http_err.c $(OPENSSL_PATH)/crypto/http/http_lib.c @@ -318,6 +326,7 @@ $(OPENSSL_PATH)/crypto/lhash/lhash.c $(OPENSSL_PATH)/crypto/asn1_dsa.c $(OPENSSL_PATH)/crypto/bsearch.c + $(OPENSSL_PATH)/crypto/comp_methods.c $(OPENSSL_PATH)/crypto/context.c $(OPENSSL_PATH)/crypto/core_algorithm.c $(OPENSSL_PATH)/crypto/core_fetch.c @@ -327,10 +336,13 @@ 
$(OPENSSL_PATH)/crypto/cryptlib.c $(OPENSSL_PATH)/crypto/ctype.c $(OPENSSL_PATH)/crypto/cversion.c + $(OPENSSL_PATH)/crypto/defaults.c $(OPENSSL_PATH)/crypto/der_writer.c + $(OPENSSL_PATH)/crypto/deterministic_nonce.c $(OPENSSL_PATH)/crypto/ebcdic.c $(OPENSSL_PATH)/crypto/ex_data.c $(OPENSSL_PATH)/crypto/getenv.c + $(OPENSSL_PATH)/crypto/indicator_core.c $(OPENSSL_PATH)/crypto/info.c $(OPENSSL_PATH)/crypto/init.c $(OPENSSL_PATH)/crypto/initthread.c @@ -353,12 +365,15 @@ $(OPENSSL_PATH)/crypto/provider_conf.c $(OPENSSL_PATH)/crypto/provider_core.c $(OPENSSL_PATH)/crypto/punycode.c + $(OPENSSL_PATH)/crypto/quic_vlint.c $(OPENSSL_PATH)/crypto/self_test_core.c + $(OPENSSL_PATH)/crypto/sleep.c $(OPENSSL_PATH)/crypto/sparse_array.c $(OPENSSL_PATH)/crypto/threads_lib.c $(OPENSSL_PATH)/crypto/threads_none.c $(OPENSSL_PATH)/crypto/threads_pthread.c $(OPENSSL_PATH)/crypto/threads_win.c + $(OPENSSL_PATH)/crypto/time.c $(OPENSSL_PATH)/crypto/trace.c $(OPENSSL_PATH)/crypto/uid.c $(OPENSSL_PATH)/crypto/md5/md5_dgst.c @@ -375,6 +390,7 @@ $(OPENSSL_PATH)/crypto/modes/siv128.c $(OPENSSL_PATH)/crypto/modes/wrap128.c $(OPENSSL_PATH)/crypto/modes/xts128.c + $(OPENSSL_PATH)/crypto/modes/xts128gb.c $(OPENSSL_PATH)/crypto/objects/o_names.c $(OPENSSL_PATH)/crypto/objects/obj_dat.c $(OPENSSL_PATH)/crypto/objects/obj_err.c @@ -411,6 +427,7 @@ $(OPENSSL_PATH)/crypto/rand/rand_lib.c $(OPENSSL_PATH)/crypto/rand/rand_meth.c $(OPENSSL_PATH)/crypto/rand/rand_pool.c + $(OPENSSL_PATH)/crypto/rand/rand_uniform.c $(OPENSSL_PATH)/crypto/rsa/rsa_ameth.c $(OPENSSL_PATH)/crypto/rsa/rsa_asn1.c $(OPENSSL_PATH)/crypto/rsa/rsa_backend.c @@ -445,6 +462,8 @@ $(OPENSSL_PATH)/crypto/sm3/legacy_sm3.c $(OPENSSL_PATH)/crypto/sm3/sm3.c $(OPENSSL_PATH)/crypto/stack/stack.c + $(OPENSSL_PATH)/crypto/thread/arch/thread_win.c + $(OPENSSL_PATH)/crypto/thread/api.c $(OPENSSL_PATH)/crypto/txt_db/txt_db.c $(OPENSSL_PATH)/crypto/ui/ui_err.c $(OPENSSL_PATH)/crypto/ui/ui_lib.c @@ -460,14 +479,18 @@ $(OPENSSL_PATH)/crypto/x509/pcy_map.c $(OPENSSL_PATH)/crypto/x509/pcy_node.c $(OPENSSL_PATH)/crypto/x509/pcy_tree.c + $(OPENSSL_PATH)/crypto/x509/t_acert.c $(OPENSSL_PATH)/crypto/x509/t_crl.c $(OPENSSL_PATH)/crypto/x509/t_req.c $(OPENSSL_PATH)/crypto/x509/t_x509.c + $(OPENSSL_PATH)/crypto/x509/v3_ac_tgt.c $(OPENSSL_PATH)/crypto/x509/v3_addr.c $(OPENSSL_PATH)/crypto/x509/v3_admis.c $(OPENSSL_PATH)/crypto/x509/v3_akeya.c $(OPENSSL_PATH)/crypto/x509/v3_akid.c $(OPENSSL_PATH)/crypto/x509/v3_asid.c + $(OPENSSL_PATH)/crypto/x509/v3_audit_id.c + $(OPENSSL_PATH)/crypto/x509/v3_battcons.c $(OPENSSL_PATH)/crypto/x509/v3_bcons.c $(OPENSSL_PATH)/crypto/x509/v3_bitst.c $(OPENSSL_PATH)/crypto/x509/v3_conf.c @@ -476,12 +499,17 @@ $(OPENSSL_PATH)/crypto/x509/v3_enum.c $(OPENSSL_PATH)/crypto/x509/v3_extku.c $(OPENSSL_PATH)/crypto/x509/v3_genn.c + $(OPENSSL_PATH)/crypto/x509/v3_group_ac.c $(OPENSSL_PATH)/crypto/x509/v3_ia5.c + $(OPENSSL_PATH)/crypto/x509/v3_ind_iss.c $(OPENSSL_PATH)/crypto/x509/v3_info.c $(OPENSSL_PATH)/crypto/x509/v3_int.c + $(OPENSSL_PATH)/crypto/x509/v3_iobo.c $(OPENSSL_PATH)/crypto/x509/v3_ist.c $(OPENSSL_PATH)/crypto/x509/v3_lib.c $(OPENSSL_PATH)/crypto/x509/v3_ncons.c + $(OPENSSL_PATH)/crypto/x509/v3_no_ass.c + $(OPENSSL_PATH)/crypto/x509/v3_no_rev_avail.c $(OPENSSL_PATH)/crypto/x509/v3_pci.c $(OPENSSL_PATH)/crypto/x509/v3_pcia.c $(OPENSSL_PATH)/crypto/x509/v3_pcons.c @@ -490,12 +518,17 @@ $(OPENSSL_PATH)/crypto/x509/v3_prn.c $(OPENSSL_PATH)/crypto/x509/v3_purp.c $(OPENSSL_PATH)/crypto/x509/v3_san.c + $(OPENSSL_PATH)/crypto/x509/v3_sda.c + 
$(OPENSSL_PATH)/crypto/x509/v3_single_use.c $(OPENSSL_PATH)/crypto/x509/v3_skid.c + $(OPENSSL_PATH)/crypto/x509/v3_soa_id.c $(OPENSSL_PATH)/crypto/x509/v3_sxnet.c $(OPENSSL_PATH)/crypto/x509/v3_tlsf.c + $(OPENSSL_PATH)/crypto/x509/v3_usernotice.c $(OPENSSL_PATH)/crypto/x509/v3_utf8.c $(OPENSSL_PATH)/crypto/x509/v3_utl.c $(OPENSSL_PATH)/crypto/x509/v3err.c + $(OPENSSL_PATH)/crypto/x509/x509_acert.c $(OPENSSL_PATH)/crypto/x509/x509_att.c $(OPENSSL_PATH)/crypto/x509/x509_cmp.c $(OPENSSL_PATH)/crypto/x509/x509_d2.c @@ -513,6 +546,7 @@ $(OPENSSL_PATH)/crypto/x509/x509_v3.c $(OPENSSL_PATH)/crypto/x509/x509_vfy.c $(OPENSSL_PATH)/crypto/x509/x509_vpm.c + $(OPENSSL_PATH)/crypto/x509/x509aset.c $(OPENSSL_PATH)/crypto/x509/x509cset.c $(OPENSSL_PATH)/crypto/x509/x509name.c $(OPENSSL_PATH)/crypto/x509/x509rset.c @@ -522,6 +556,7 @@ $(OPENSSL_PATH)/crypto/x509/x_attrib.c $(OPENSSL_PATH)/crypto/x509/x_crl.c $(OPENSSL_PATH)/crypto/x509/x_exten.c + $(OPENSSL_PATH)/crypto/x509/x_ietfatt.c $(OPENSSL_PATH)/crypto/x509/x_name.c $(OPENSSL_PATH)/crypto/x509/x_pubkey.c $(OPENSSL_PATH)/crypto/x509/x_req.c @@ -568,7 +603,9 @@ $(OPENSSL_PATH)/providers/implementations/encode_decode/endecoder_common.c $(OPENSSL_PATH)/providers/implementations/exchange/dh_exch.c $(OPENSSL_PATH)/providers/implementations/exchange/kdf_exch.c + $(OPENSSL_PATH)/providers/implementations/kdfs/argon2.c $(OPENSSL_PATH)/providers/implementations/kdfs/hkdf.c + $(OPENSSL_PATH)/providers/implementations/kdfs/hmacdrbg_kdf.c $(OPENSSL_PATH)/providers/implementations/kdfs/kbkdf.c $(OPENSSL_PATH)/providers/implementations/kdfs/krb5kdf.c $(OPENSSL_PATH)/providers/implementations/kdfs/pbkdf2.c @@ -587,12 +624,12 @@ $(OPENSSL_PATH)/providers/implementations/macs/gmac_prov.c $(OPENSSL_PATH)/providers/implementations/macs/hmac_prov.c $(OPENSSL_PATH)/providers/implementations/macs/kmac_prov.c - $(OPENSSL_PATH)/providers/implementations/rands/crngt.c $(OPENSSL_PATH)/providers/implementations/rands/drbg.c $(OPENSSL_PATH)/providers/implementations/rands/drbg_ctr.c $(OPENSSL_PATH)/providers/implementations/rands/drbg_hash.c $(OPENSSL_PATH)/providers/implementations/rands/drbg_hmac.c $(OPENSSL_PATH)/providers/implementations/rands/seed_src.c + $(OPENSSL_PATH)/providers/implementations/rands/seed_src_jitter.c $(OPENSSL_PATH)/providers/implementations/rands/test_rng.c $(OPENSSL_PATH)/providers/implementations/rands/seeding/rand_cpu_x86.c $(OPENSSL_PATH)/providers/implementations/rands/seeding/rand_tsc.c @@ -600,7 +637,8 @@ $(OPENSSL_PATH)/providers/implementations/rands/seeding/rand_win.c $(OPENSSL_PATH)/providers/implementations/signature/mac_legacy_sig.c $(OPENSSL_PATH)/providers/implementations/signature/rsa_sig.c - $(OPENSSL_PATH)/ssl/s3_cbc.c + $(OPENSSL_PATH)/ssl/record/methods/ssl3_cbc.c + $(OPENSSL_GEN_PATH)/crypto/params_idx.c $(OPENSSL_PATH)/providers/common/der/der_rsa_key.c $(OPENSSL_PATH)/providers/common/provider_ctx.c $(OPENSSL_PATH)/providers/common/provider_err.c @@ -612,7 +650,7 @@ $(OPENSSL_PATH)/providers/implementations/ciphers/ciphercommon_gcm_hw.c $(OPENSSL_PATH)/providers/implementations/ciphers/ciphercommon_hw.c $(OPENSSL_PATH)/providers/implementations/digests/digestcommon.c - $(OPENSSL_PATH)/ssl/record/tls_pad.c + $(OPENSSL_PATH)/ssl/record/methods/tls_pad.c $(OPENSSL_GEN_PATH)/providers/common/der/der_digests_gen.c $(OPENSSL_GEN_PATH)/providers/common/der/der_rsa_gen.c $(OPENSSL_GEN_PATH)/providers/common/der/der_wrap_gen.c @@ -627,6 +665,7 @@ $(OPENSSL_PATH)/ssl/s3_msg.c $(OPENSSL_PATH)/ssl/ssl_asn1.c $(OPENSSL_PATH)/ssl/ssl_cert.c + 
$(OPENSSL_PATH)/ssl/ssl_cert_comp.c $(OPENSSL_PATH)/ssl/ssl_ciph.c $(OPENSSL_PATH)/ssl/ssl_conf.c $(OPENSSL_PATH)/ssl/ssl_err.c @@ -646,12 +685,16 @@ $(OPENSSL_PATH)/ssl/tls13_enc.c $(OPENSSL_PATH)/ssl/tls_depr.c $(OPENSSL_PATH)/ssl/tls_srp.c - $(OPENSSL_PATH)/ssl/record/dtls1_bitmap.c $(OPENSSL_PATH)/ssl/record/rec_layer_d1.c $(OPENSSL_PATH)/ssl/record/rec_layer_s3.c - $(OPENSSL_PATH)/ssl/record/ssl3_buffer.c - $(OPENSSL_PATH)/ssl/record/ssl3_record.c - $(OPENSSL_PATH)/ssl/record/ssl3_record_tls13.c + $(OPENSSL_PATH)/ssl/record/methods/dtls_meth.c + $(OPENSSL_PATH)/ssl/record/methods/ssl3_meth.c + $(OPENSSL_PATH)/ssl/record/methods/tls13_meth.c + $(OPENSSL_PATH)/ssl/record/methods/tls1_meth.c + $(OPENSSL_PATH)/ssl/record/methods/tls_common.c + $(OPENSSL_PATH)/ssl/record/methods/tls_multib.c + $(OPENSSL_PATH)/ssl/record/methods/tlsany_meth.c + $(OPENSSL_PATH)/ssl/rio/poll_immediate.c $(OPENSSL_PATH)/ssl/statem/extensions.c $(OPENSSL_PATH)/ssl/statem/extensions_clnt.c $(OPENSSL_PATH)/ssl/statem/extensions_cust.c @@ -777,6 +820,7 @@ $(OPENSSL_PATH)/crypto/bio/bss_conn.c $(OPENSSL_PATH)/crypto/bio/bss_core.c $(OPENSSL_PATH)/crypto/bio/bss_dgram.c + $(OPENSSL_PATH)/crypto/bio/bss_dgram_pair.c $(OPENSSL_PATH)/crypto/bio/bss_fd.c $(OPENSSL_PATH)/crypto/bio/bss_file.c $(OPENSSL_PATH)/crypto/bio/bss_log.c @@ -819,7 +863,9 @@ $(OPENSSL_PATH)/crypto/bn/rsaz_exp_x2.c $(OPENSSL_PATH)/crypto/buffer/buf_err.c $(OPENSSL_PATH)/crypto/buffer/buffer.c + $(OPENSSL_PATH)/crypto/comp/c_brotli.c $(OPENSSL_PATH)/crypto/comp/c_zlib.c + $(OPENSSL_PATH)/crypto/comp/c_zstd.c $(OPENSSL_PATH)/crypto/comp/comp_err.c $(OPENSSL_PATH)/crypto/comp/comp_lib.c $(OPENSSL_PATH)/crypto/conf/conf_api.c @@ -859,7 +905,9 @@ $(OPENSSL_PATH)/crypto/err/err_all.c $(OPENSSL_PATH)/crypto/err/err_all_legacy.c $(OPENSSL_PATH)/crypto/err/err_blocks.c + $(OPENSSL_PATH)/crypto/err/err_mark.c $(OPENSSL_PATH)/crypto/err/err_prn.c + $(OPENSSL_PATH)/crypto/err/err_save.c $(OPENSSL_PATH)/crypto/ess/ess_asn1.c $(OPENSSL_PATH)/crypto/ess/ess_err.c $(OPENSSL_PATH)/crypto/ess/ess_lib.c @@ -942,7 +990,10 @@ $(OPENSSL_PATH)/crypto/ffc/ffc_params.c $(OPENSSL_PATH)/crypto/ffc/ffc_params_generate.c $(OPENSSL_PATH)/crypto/ffc/ffc_params_validate.c + $(OPENSSL_PATH)/crypto/hashtable/hashtable.c $(OPENSSL_PATH)/crypto/hmac/hmac.c + $(OPENSSL_PATH)/crypto/hpke/hpke.c + $(OPENSSL_PATH)/crypto/hpke/hpke_util.c $(OPENSSL_PATH)/crypto/http/http_client.c $(OPENSSL_PATH)/crypto/http/http_err.c $(OPENSSL_PATH)/crypto/http/http_lib.c @@ -951,6 +1002,7 @@ $(OPENSSL_PATH)/crypto/lhash/lhash.c $(OPENSSL_PATH)/crypto/asn1_dsa.c $(OPENSSL_PATH)/crypto/bsearch.c + $(OPENSSL_PATH)/crypto/comp_methods.c $(OPENSSL_PATH)/crypto/context.c $(OPENSSL_PATH)/crypto/core_algorithm.c $(OPENSSL_PATH)/crypto/core_fetch.c @@ -960,10 +1012,13 @@ $(OPENSSL_PATH)/crypto/cryptlib.c $(OPENSSL_PATH)/crypto/ctype.c $(OPENSSL_PATH)/crypto/cversion.c + $(OPENSSL_PATH)/crypto/defaults.c $(OPENSSL_PATH)/crypto/der_writer.c + $(OPENSSL_PATH)/crypto/deterministic_nonce.c $(OPENSSL_PATH)/crypto/ebcdic.c $(OPENSSL_PATH)/crypto/ex_data.c $(OPENSSL_PATH)/crypto/getenv.c + $(OPENSSL_PATH)/crypto/indicator_core.c $(OPENSSL_PATH)/crypto/info.c $(OPENSSL_PATH)/crypto/init.c $(OPENSSL_PATH)/crypto/initthread.c @@ -986,12 +1041,15 @@ $(OPENSSL_PATH)/crypto/provider_conf.c $(OPENSSL_PATH)/crypto/provider_core.c $(OPENSSL_PATH)/crypto/punycode.c + $(OPENSSL_PATH)/crypto/quic_vlint.c $(OPENSSL_PATH)/crypto/self_test_core.c + $(OPENSSL_PATH)/crypto/sleep.c $(OPENSSL_PATH)/crypto/sparse_array.c 
$(OPENSSL_PATH)/crypto/threads_lib.c $(OPENSSL_PATH)/crypto/threads_none.c $(OPENSSL_PATH)/crypto/threads_pthread.c $(OPENSSL_PATH)/crypto/threads_win.c + $(OPENSSL_PATH)/crypto/time.c $(OPENSSL_PATH)/crypto/trace.c $(OPENSSL_PATH)/crypto/uid.c $(OPENSSL_PATH)/crypto/md5/md5_dgst.c @@ -1008,6 +1066,7 @@ $(OPENSSL_PATH)/crypto/modes/siv128.c $(OPENSSL_PATH)/crypto/modes/wrap128.c $(OPENSSL_PATH)/crypto/modes/xts128.c + $(OPENSSL_PATH)/crypto/modes/xts128gb.c $(OPENSSL_PATH)/crypto/objects/o_names.c $(OPENSSL_PATH)/crypto/objects/obj_dat.c $(OPENSSL_PATH)/crypto/objects/obj_err.c @@ -1044,6 +1103,7 @@ $(OPENSSL_PATH)/crypto/rand/rand_lib.c $(OPENSSL_PATH)/crypto/rand/rand_meth.c $(OPENSSL_PATH)/crypto/rand/rand_pool.c + $(OPENSSL_PATH)/crypto/rand/rand_uniform.c $(OPENSSL_PATH)/crypto/rsa/rsa_ameth.c $(OPENSSL_PATH)/crypto/rsa/rsa_asn1.c $(OPENSSL_PATH)/crypto/rsa/rsa_backend.c @@ -1077,6 +1137,8 @@ $(OPENSSL_PATH)/crypto/sm3/legacy_sm3.c $(OPENSSL_PATH)/crypto/sm3/sm3.c $(OPENSSL_PATH)/crypto/stack/stack.c + $(OPENSSL_PATH)/crypto/thread/arch/thread_win.c + $(OPENSSL_PATH)/crypto/thread/api.c $(OPENSSL_PATH)/crypto/txt_db/txt_db.c $(OPENSSL_PATH)/crypto/ui/ui_err.c $(OPENSSL_PATH)/crypto/ui/ui_lib.c @@ -1092,14 +1154,18 @@ $(OPENSSL_PATH)/crypto/x509/pcy_map.c $(OPENSSL_PATH)/crypto/x509/pcy_node.c $(OPENSSL_PATH)/crypto/x509/pcy_tree.c + $(OPENSSL_PATH)/crypto/x509/t_acert.c $(OPENSSL_PATH)/crypto/x509/t_crl.c $(OPENSSL_PATH)/crypto/x509/t_req.c $(OPENSSL_PATH)/crypto/x509/t_x509.c + $(OPENSSL_PATH)/crypto/x509/v3_ac_tgt.c $(OPENSSL_PATH)/crypto/x509/v3_addr.c $(OPENSSL_PATH)/crypto/x509/v3_admis.c $(OPENSSL_PATH)/crypto/x509/v3_akeya.c $(OPENSSL_PATH)/crypto/x509/v3_akid.c $(OPENSSL_PATH)/crypto/x509/v3_asid.c + $(OPENSSL_PATH)/crypto/x509/v3_audit_id.c + $(OPENSSL_PATH)/crypto/x509/v3_battcons.c $(OPENSSL_PATH)/crypto/x509/v3_bcons.c $(OPENSSL_PATH)/crypto/x509/v3_bitst.c $(OPENSSL_PATH)/crypto/x509/v3_conf.c @@ -1108,12 +1174,17 @@ $(OPENSSL_PATH)/crypto/x509/v3_enum.c $(OPENSSL_PATH)/crypto/x509/v3_extku.c $(OPENSSL_PATH)/crypto/x509/v3_genn.c + $(OPENSSL_PATH)/crypto/x509/v3_group_ac.c $(OPENSSL_PATH)/crypto/x509/v3_ia5.c + $(OPENSSL_PATH)/crypto/x509/v3_ind_iss.c $(OPENSSL_PATH)/crypto/x509/v3_info.c $(OPENSSL_PATH)/crypto/x509/v3_int.c + $(OPENSSL_PATH)/crypto/x509/v3_iobo.c $(OPENSSL_PATH)/crypto/x509/v3_ist.c $(OPENSSL_PATH)/crypto/x509/v3_lib.c $(OPENSSL_PATH)/crypto/x509/v3_ncons.c + $(OPENSSL_PATH)/crypto/x509/v3_no_ass.c + $(OPENSSL_PATH)/crypto/x509/v3_no_rev_avail.c $(OPENSSL_PATH)/crypto/x509/v3_pci.c $(OPENSSL_PATH)/crypto/x509/v3_pcia.c $(OPENSSL_PATH)/crypto/x509/v3_pcons.c @@ -1122,12 +1193,17 @@ $(OPENSSL_PATH)/crypto/x509/v3_prn.c $(OPENSSL_PATH)/crypto/x509/v3_purp.c $(OPENSSL_PATH)/crypto/x509/v3_san.c + $(OPENSSL_PATH)/crypto/x509/v3_sda.c + $(OPENSSL_PATH)/crypto/x509/v3_single_use.c $(OPENSSL_PATH)/crypto/x509/v3_skid.c + $(OPENSSL_PATH)/crypto/x509/v3_soa_id.c $(OPENSSL_PATH)/crypto/x509/v3_sxnet.c $(OPENSSL_PATH)/crypto/x509/v3_tlsf.c + $(OPENSSL_PATH)/crypto/x509/v3_usernotice.c $(OPENSSL_PATH)/crypto/x509/v3_utf8.c $(OPENSSL_PATH)/crypto/x509/v3_utl.c $(OPENSSL_PATH)/crypto/x509/v3err.c + $(OPENSSL_PATH)/crypto/x509/x509_acert.c $(OPENSSL_PATH)/crypto/x509/x509_att.c $(OPENSSL_PATH)/crypto/x509/x509_cmp.c $(OPENSSL_PATH)/crypto/x509/x509_d2.c @@ -1145,6 +1221,7 @@ $(OPENSSL_PATH)/crypto/x509/x509_v3.c $(OPENSSL_PATH)/crypto/x509/x509_vfy.c $(OPENSSL_PATH)/crypto/x509/x509_vpm.c + $(OPENSSL_PATH)/crypto/x509/x509aset.c $(OPENSSL_PATH)/crypto/x509/x509cset.c 
$(OPENSSL_PATH)/crypto/x509/x509name.c $(OPENSSL_PATH)/crypto/x509/x509rset.c @@ -1154,6 +1231,7 @@ $(OPENSSL_PATH)/crypto/x509/x_attrib.c $(OPENSSL_PATH)/crypto/x509/x_crl.c $(OPENSSL_PATH)/crypto/x509/x_exten.c + $(OPENSSL_PATH)/crypto/x509/x_ietfatt.c $(OPENSSL_PATH)/crypto/x509/x_name.c $(OPENSSL_PATH)/crypto/x509/x_pubkey.c $(OPENSSL_PATH)/crypto/x509/x_req.c @@ -1200,7 +1278,9 @@ $(OPENSSL_PATH)/providers/implementations/encode_decode/endecoder_common.c $(OPENSSL_PATH)/providers/implementations/exchange/dh_exch.c $(OPENSSL_PATH)/providers/implementations/exchange/kdf_exch.c + $(OPENSSL_PATH)/providers/implementations/kdfs/argon2.c $(OPENSSL_PATH)/providers/implementations/kdfs/hkdf.c + $(OPENSSL_PATH)/providers/implementations/kdfs/hmacdrbg_kdf.c $(OPENSSL_PATH)/providers/implementations/kdfs/kbkdf.c $(OPENSSL_PATH)/providers/implementations/kdfs/krb5kdf.c $(OPENSSL_PATH)/providers/implementations/kdfs/pbkdf2.c @@ -1219,12 +1299,12 @@ $(OPENSSL_PATH)/providers/implementations/macs/gmac_prov.c $(OPENSSL_PATH)/providers/implementations/macs/hmac_prov.c $(OPENSSL_PATH)/providers/implementations/macs/kmac_prov.c - $(OPENSSL_PATH)/providers/implementations/rands/crngt.c $(OPENSSL_PATH)/providers/implementations/rands/drbg.c $(OPENSSL_PATH)/providers/implementations/rands/drbg_ctr.c $(OPENSSL_PATH)/providers/implementations/rands/drbg_hash.c $(OPENSSL_PATH)/providers/implementations/rands/drbg_hmac.c $(OPENSSL_PATH)/providers/implementations/rands/seed_src.c + $(OPENSSL_PATH)/providers/implementations/rands/seed_src_jitter.c $(OPENSSL_PATH)/providers/implementations/rands/test_rng.c $(OPENSSL_PATH)/providers/implementations/rands/seeding/rand_cpu_x86.c $(OPENSSL_PATH)/providers/implementations/rands/seeding/rand_tsc.c @@ -1232,7 +1312,8 @@ $(OPENSSL_PATH)/providers/implementations/rands/seeding/rand_win.c $(OPENSSL_PATH)/providers/implementations/signature/mac_legacy_sig.c $(OPENSSL_PATH)/providers/implementations/signature/rsa_sig.c - $(OPENSSL_PATH)/ssl/s3_cbc.c + $(OPENSSL_PATH)/ssl/record/methods/ssl3_cbc.c + $(OPENSSL_GEN_PATH)/crypto/params_idx.c $(OPENSSL_PATH)/providers/common/der/der_rsa_key.c $(OPENSSL_PATH)/providers/common/provider_ctx.c $(OPENSSL_PATH)/providers/common/provider_err.c @@ -1244,7 +1325,7 @@ $(OPENSSL_PATH)/providers/implementations/ciphers/ciphercommon_gcm_hw.c $(OPENSSL_PATH)/providers/implementations/ciphers/ciphercommon_hw.c $(OPENSSL_PATH)/providers/implementations/digests/digestcommon.c - $(OPENSSL_PATH)/ssl/record/tls_pad.c + $(OPENSSL_PATH)/ssl/record/methods/tls_pad.c $(OPENSSL_GEN_PATH)/providers/common/der/der_digests_gen.c $(OPENSSL_GEN_PATH)/providers/common/der/der_rsa_gen.c $(OPENSSL_GEN_PATH)/providers/common/der/der_wrap_gen.c @@ -1259,6 +1340,7 @@ $(OPENSSL_PATH)/ssl/s3_msg.c $(OPENSSL_PATH)/ssl/ssl_asn1.c $(OPENSSL_PATH)/ssl/ssl_cert.c + $(OPENSSL_PATH)/ssl/ssl_cert_comp.c $(OPENSSL_PATH)/ssl/ssl_ciph.c $(OPENSSL_PATH)/ssl/ssl_conf.c $(OPENSSL_PATH)/ssl/ssl_err.c @@ -1278,12 +1360,16 @@ $(OPENSSL_PATH)/ssl/tls13_enc.c $(OPENSSL_PATH)/ssl/tls_depr.c $(OPENSSL_PATH)/ssl/tls_srp.c - $(OPENSSL_PATH)/ssl/record/dtls1_bitmap.c $(OPENSSL_PATH)/ssl/record/rec_layer_d1.c $(OPENSSL_PATH)/ssl/record/rec_layer_s3.c - $(OPENSSL_PATH)/ssl/record/ssl3_buffer.c - $(OPENSSL_PATH)/ssl/record/ssl3_record.c - $(OPENSSL_PATH)/ssl/record/ssl3_record_tls13.c + $(OPENSSL_PATH)/ssl/record/methods/dtls_meth.c + $(OPENSSL_PATH)/ssl/record/methods/ssl3_meth.c + $(OPENSSL_PATH)/ssl/record/methods/tls13_meth.c + $(OPENSSL_PATH)/ssl/record/methods/tls1_meth.c + 
$(OPENSSL_PATH)/ssl/record/methods/tls_common.c + $(OPENSSL_PATH)/ssl/record/methods/tls_multib.c + $(OPENSSL_PATH)/ssl/record/methods/tlsany_meth.c + $(OPENSSL_PATH)/ssl/rio/poll_immediate.c $(OPENSSL_PATH)/ssl/statem/extensions.c $(OPENSSL_PATH)/ssl/statem/extensions_clnt.c $(OPENSSL_PATH)/ssl/statem/extensions_cust.c @@ -1300,6 +1386,7 @@ $(OPENSSL_GEN_PATH)/X64-MSFT/crypto/aes/vpaes-x86_64.nasm | MSFT $(OPENSSL_GEN_PATH)/X64-MSFT/crypto/x86_64cpuid.nasm | MSFT $(OPENSSL_GEN_PATH)/X64-MSFT/crypto/md5/md5-x86_64.nasm | MSFT + $(OPENSSL_GEN_PATH)/X64-MSFT/crypto/modes/aes-gcm-avx512.nasm | MSFT $(OPENSSL_GEN_PATH)/X64-MSFT/crypto/modes/aesni-gcm-x86_64.nasm | MSFT $(OPENSSL_GEN_PATH)/X64-MSFT/crypto/modes/ghash-x86_64.nasm | MSFT $(OPENSSL_GEN_PATH)/X64-MSFT/crypto/sha/keccak1600-x86_64.nasm | MSFT @@ -1317,6 +1404,7 @@ $(OPENSSL_GEN_PATH)/X64-GCC/crypto/aes/vpaes-x86_64.s | GCC $(OPENSSL_GEN_PATH)/X64-GCC/crypto/x86_64cpuid.s | GCC $(OPENSSL_GEN_PATH)/X64-GCC/crypto/md5/md5-x86_64.s | GCC + $(OPENSSL_GEN_PATH)/X64-GCC/crypto/modes/aes-gcm-avx512.s | GCC $(OPENSSL_GEN_PATH)/X64-GCC/crypto/modes/aesni-gcm-x86_64.s | GCC $(OPENSSL_GEN_PATH)/X64-GCC/crypto/modes/ghash-x86_64.s | GCC $(OPENSSL_GEN_PATH)/X64-GCC/crypto/sha/keccak1600-x86_64.s | GCC @@ -1427,6 +1515,7 @@ $(OPENSSL_PATH)/crypto/bio/bss_conn.c $(OPENSSL_PATH)/crypto/bio/bss_core.c $(OPENSSL_PATH)/crypto/bio/bss_dgram.c + $(OPENSSL_PATH)/crypto/bio/bss_dgram_pair.c $(OPENSSL_PATH)/crypto/bio/bss_fd.c $(OPENSSL_PATH)/crypto/bio/bss_file.c $(OPENSSL_PATH)/crypto/bio/bss_log.c @@ -1468,7 +1557,9 @@ $(OPENSSL_PATH)/crypto/bn/bn_x931p.c $(OPENSSL_PATH)/crypto/buffer/buf_err.c $(OPENSSL_PATH)/crypto/buffer/buffer.c + $(OPENSSL_PATH)/crypto/comp/c_brotli.c $(OPENSSL_PATH)/crypto/comp/c_zlib.c + $(OPENSSL_PATH)/crypto/comp/c_zstd.c $(OPENSSL_PATH)/crypto/comp/comp_err.c $(OPENSSL_PATH)/crypto/comp/comp_lib.c $(OPENSSL_PATH)/crypto/conf/conf_api.c @@ -1508,7 +1599,9 @@ $(OPENSSL_PATH)/crypto/err/err_all.c $(OPENSSL_PATH)/crypto/err/err_all_legacy.c $(OPENSSL_PATH)/crypto/err/err_blocks.c + $(OPENSSL_PATH)/crypto/err/err_mark.c $(OPENSSL_PATH)/crypto/err/err_prn.c + $(OPENSSL_PATH)/crypto/err/err_save.c $(OPENSSL_PATH)/crypto/ess/ess_asn1.c $(OPENSSL_PATH)/crypto/ess/ess_err.c $(OPENSSL_PATH)/crypto/ess/ess_lib.c @@ -1591,7 +1684,10 @@ $(OPENSSL_PATH)/crypto/ffc/ffc_params.c $(OPENSSL_PATH)/crypto/ffc/ffc_params_generate.c $(OPENSSL_PATH)/crypto/ffc/ffc_params_validate.c + $(OPENSSL_PATH)/crypto/hashtable/hashtable.c $(OPENSSL_PATH)/crypto/hmac/hmac.c + $(OPENSSL_PATH)/crypto/hpke/hpke.c + $(OPENSSL_PATH)/crypto/hpke/hpke_util.c $(OPENSSL_PATH)/crypto/http/http_client.c $(OPENSSL_PATH)/crypto/http/http_err.c $(OPENSSL_PATH)/crypto/http/http_lib.c @@ -1600,6 +1696,7 @@ $(OPENSSL_PATH)/crypto/lhash/lhash.c $(OPENSSL_PATH)/crypto/asn1_dsa.c $(OPENSSL_PATH)/crypto/bsearch.c + $(OPENSSL_PATH)/crypto/comp_methods.c $(OPENSSL_PATH)/crypto/context.c $(OPENSSL_PATH)/crypto/core_algorithm.c $(OPENSSL_PATH)/crypto/core_fetch.c @@ -1609,10 +1706,13 @@ $(OPENSSL_PATH)/crypto/cryptlib.c $(OPENSSL_PATH)/crypto/ctype.c $(OPENSSL_PATH)/crypto/cversion.c + $(OPENSSL_PATH)/crypto/defaults.c $(OPENSSL_PATH)/crypto/der_writer.c + $(OPENSSL_PATH)/crypto/deterministic_nonce.c $(OPENSSL_PATH)/crypto/ebcdic.c $(OPENSSL_PATH)/crypto/ex_data.c $(OPENSSL_PATH)/crypto/getenv.c + $(OPENSSL_PATH)/crypto/indicator_core.c $(OPENSSL_PATH)/crypto/info.c $(OPENSSL_PATH)/crypto/init.c $(OPENSSL_PATH)/crypto/initthread.c @@ -1635,12 +1735,15 @@ 
$(OPENSSL_PATH)/crypto/provider_conf.c $(OPENSSL_PATH)/crypto/provider_core.c $(OPENSSL_PATH)/crypto/punycode.c + $(OPENSSL_PATH)/crypto/quic_vlint.c $(OPENSSL_PATH)/crypto/self_test_core.c + $(OPENSSL_PATH)/crypto/sleep.c $(OPENSSL_PATH)/crypto/sparse_array.c $(OPENSSL_PATH)/crypto/threads_lib.c $(OPENSSL_PATH)/crypto/threads_none.c $(OPENSSL_PATH)/crypto/threads_pthread.c $(OPENSSL_PATH)/crypto/threads_win.c + $(OPENSSL_PATH)/crypto/time.c $(OPENSSL_PATH)/crypto/trace.c $(OPENSSL_PATH)/crypto/uid.c $(OPENSSL_PATH)/crypto/md5/md5_dgst.c @@ -1657,6 +1760,7 @@ $(OPENSSL_PATH)/crypto/modes/siv128.c $(OPENSSL_PATH)/crypto/modes/wrap128.c $(OPENSSL_PATH)/crypto/modes/xts128.c + $(OPENSSL_PATH)/crypto/modes/xts128gb.c $(OPENSSL_PATH)/crypto/objects/o_names.c $(OPENSSL_PATH)/crypto/objects/obj_dat.c $(OPENSSL_PATH)/crypto/objects/obj_err.c @@ -1693,6 +1797,7 @@ $(OPENSSL_PATH)/crypto/rand/rand_lib.c $(OPENSSL_PATH)/crypto/rand/rand_meth.c $(OPENSSL_PATH)/crypto/rand/rand_pool.c + $(OPENSSL_PATH)/crypto/rand/rand_uniform.c $(OPENSSL_PATH)/crypto/rsa/rsa_ameth.c $(OPENSSL_PATH)/crypto/rsa/rsa_asn1.c $(OPENSSL_PATH)/crypto/rsa/rsa_backend.c @@ -1726,6 +1831,8 @@ $(OPENSSL_PATH)/crypto/sm3/legacy_sm3.c $(OPENSSL_PATH)/crypto/sm3/sm3.c $(OPENSSL_PATH)/crypto/stack/stack.c + $(OPENSSL_PATH)/crypto/thread/arch/thread_win.c + $(OPENSSL_PATH)/crypto/thread/api.c $(OPENSSL_PATH)/crypto/txt_db/txt_db.c $(OPENSSL_PATH)/crypto/ui/ui_err.c $(OPENSSL_PATH)/crypto/ui/ui_lib.c @@ -1741,14 +1848,18 @@ $(OPENSSL_PATH)/crypto/x509/pcy_map.c $(OPENSSL_PATH)/crypto/x509/pcy_node.c $(OPENSSL_PATH)/crypto/x509/pcy_tree.c + $(OPENSSL_PATH)/crypto/x509/t_acert.c $(OPENSSL_PATH)/crypto/x509/t_crl.c $(OPENSSL_PATH)/crypto/x509/t_req.c $(OPENSSL_PATH)/crypto/x509/t_x509.c + $(OPENSSL_PATH)/crypto/x509/v3_ac_tgt.c $(OPENSSL_PATH)/crypto/x509/v3_addr.c $(OPENSSL_PATH)/crypto/x509/v3_admis.c $(OPENSSL_PATH)/crypto/x509/v3_akeya.c $(OPENSSL_PATH)/crypto/x509/v3_akid.c $(OPENSSL_PATH)/crypto/x509/v3_asid.c + $(OPENSSL_PATH)/crypto/x509/v3_audit_id.c + $(OPENSSL_PATH)/crypto/x509/v3_battcons.c $(OPENSSL_PATH)/crypto/x509/v3_bcons.c $(OPENSSL_PATH)/crypto/x509/v3_bitst.c $(OPENSSL_PATH)/crypto/x509/v3_conf.c @@ -1757,12 +1868,17 @@ $(OPENSSL_PATH)/crypto/x509/v3_enum.c $(OPENSSL_PATH)/crypto/x509/v3_extku.c $(OPENSSL_PATH)/crypto/x509/v3_genn.c + $(OPENSSL_PATH)/crypto/x509/v3_group_ac.c $(OPENSSL_PATH)/crypto/x509/v3_ia5.c + $(OPENSSL_PATH)/crypto/x509/v3_ind_iss.c $(OPENSSL_PATH)/crypto/x509/v3_info.c $(OPENSSL_PATH)/crypto/x509/v3_int.c + $(OPENSSL_PATH)/crypto/x509/v3_iobo.c $(OPENSSL_PATH)/crypto/x509/v3_ist.c $(OPENSSL_PATH)/crypto/x509/v3_lib.c $(OPENSSL_PATH)/crypto/x509/v3_ncons.c + $(OPENSSL_PATH)/crypto/x509/v3_no_ass.c + $(OPENSSL_PATH)/crypto/x509/v3_no_rev_avail.c $(OPENSSL_PATH)/crypto/x509/v3_pci.c $(OPENSSL_PATH)/crypto/x509/v3_pcia.c $(OPENSSL_PATH)/crypto/x509/v3_pcons.c @@ -1771,12 +1887,17 @@ $(OPENSSL_PATH)/crypto/x509/v3_prn.c $(OPENSSL_PATH)/crypto/x509/v3_purp.c $(OPENSSL_PATH)/crypto/x509/v3_san.c + $(OPENSSL_PATH)/crypto/x509/v3_sda.c + $(OPENSSL_PATH)/crypto/x509/v3_single_use.c $(OPENSSL_PATH)/crypto/x509/v3_skid.c + $(OPENSSL_PATH)/crypto/x509/v3_soa_id.c $(OPENSSL_PATH)/crypto/x509/v3_sxnet.c $(OPENSSL_PATH)/crypto/x509/v3_tlsf.c + $(OPENSSL_PATH)/crypto/x509/v3_usernotice.c $(OPENSSL_PATH)/crypto/x509/v3_utf8.c $(OPENSSL_PATH)/crypto/x509/v3_utl.c $(OPENSSL_PATH)/crypto/x509/v3err.c + $(OPENSSL_PATH)/crypto/x509/x509_acert.c $(OPENSSL_PATH)/crypto/x509/x509_att.c 
$(OPENSSL_PATH)/crypto/x509/x509_cmp.c $(OPENSSL_PATH)/crypto/x509/x509_d2.c @@ -1794,6 +1915,7 @@ $(OPENSSL_PATH)/crypto/x509/x509_v3.c $(OPENSSL_PATH)/crypto/x509/x509_vfy.c $(OPENSSL_PATH)/crypto/x509/x509_vpm.c + $(OPENSSL_PATH)/crypto/x509/x509aset.c $(OPENSSL_PATH)/crypto/x509/x509cset.c $(OPENSSL_PATH)/crypto/x509/x509name.c $(OPENSSL_PATH)/crypto/x509/x509rset.c @@ -1803,6 +1925,7 @@ $(OPENSSL_PATH)/crypto/x509/x_attrib.c $(OPENSSL_PATH)/crypto/x509/x_crl.c $(OPENSSL_PATH)/crypto/x509/x_exten.c + $(OPENSSL_PATH)/crypto/x509/x_ietfatt.c $(OPENSSL_PATH)/crypto/x509/x_name.c $(OPENSSL_PATH)/crypto/x509/x_pubkey.c $(OPENSSL_PATH)/crypto/x509/x_req.c @@ -1849,7 +1972,9 @@ $(OPENSSL_PATH)/providers/implementations/encode_decode/endecoder_common.c $(OPENSSL_PATH)/providers/implementations/exchange/dh_exch.c $(OPENSSL_PATH)/providers/implementations/exchange/kdf_exch.c + $(OPENSSL_PATH)/providers/implementations/kdfs/argon2.c $(OPENSSL_PATH)/providers/implementations/kdfs/hkdf.c + $(OPENSSL_PATH)/providers/implementations/kdfs/hmacdrbg_kdf.c $(OPENSSL_PATH)/providers/implementations/kdfs/kbkdf.c $(OPENSSL_PATH)/providers/implementations/kdfs/krb5kdf.c $(OPENSSL_PATH)/providers/implementations/kdfs/pbkdf2.c @@ -1868,12 +1993,12 @@ $(OPENSSL_PATH)/providers/implementations/macs/gmac_prov.c $(OPENSSL_PATH)/providers/implementations/macs/hmac_prov.c $(OPENSSL_PATH)/providers/implementations/macs/kmac_prov.c - $(OPENSSL_PATH)/providers/implementations/rands/crngt.c $(OPENSSL_PATH)/providers/implementations/rands/drbg.c $(OPENSSL_PATH)/providers/implementations/rands/drbg_ctr.c $(OPENSSL_PATH)/providers/implementations/rands/drbg_hash.c $(OPENSSL_PATH)/providers/implementations/rands/drbg_hmac.c $(OPENSSL_PATH)/providers/implementations/rands/seed_src.c + $(OPENSSL_PATH)/providers/implementations/rands/seed_src_jitter.c $(OPENSSL_PATH)/providers/implementations/rands/test_rng.c $(OPENSSL_PATH)/providers/implementations/rands/seeding/rand_cpu_x86.c $(OPENSSL_PATH)/providers/implementations/rands/seeding/rand_tsc.c @@ -1881,7 +2006,8 @@ $(OPENSSL_PATH)/providers/implementations/rands/seeding/rand_win.c $(OPENSSL_PATH)/providers/implementations/signature/mac_legacy_sig.c $(OPENSSL_PATH)/providers/implementations/signature/rsa_sig.c - $(OPENSSL_PATH)/ssl/s3_cbc.c + $(OPENSSL_PATH)/ssl/record/methods/ssl3_cbc.c + $(OPENSSL_GEN_PATH)/crypto/params_idx.c $(OPENSSL_PATH)/providers/common/der/der_rsa_key.c $(OPENSSL_PATH)/providers/common/provider_ctx.c $(OPENSSL_PATH)/providers/common/provider_err.c @@ -1893,7 +2019,7 @@ $(OPENSSL_PATH)/providers/implementations/ciphers/ciphercommon_gcm_hw.c $(OPENSSL_PATH)/providers/implementations/ciphers/ciphercommon_hw.c $(OPENSSL_PATH)/providers/implementations/digests/digestcommon.c - $(OPENSSL_PATH)/ssl/record/tls_pad.c + $(OPENSSL_PATH)/ssl/record/methods/tls_pad.c $(OPENSSL_GEN_PATH)/providers/common/der/der_digests_gen.c $(OPENSSL_GEN_PATH)/providers/common/der/der_rsa_gen.c $(OPENSSL_GEN_PATH)/providers/common/der/der_wrap_gen.c @@ -1908,6 +2034,7 @@ $(OPENSSL_PATH)/ssl/s3_msg.c $(OPENSSL_PATH)/ssl/ssl_asn1.c $(OPENSSL_PATH)/ssl/ssl_cert.c + $(OPENSSL_PATH)/ssl/ssl_cert_comp.c $(OPENSSL_PATH)/ssl/ssl_ciph.c $(OPENSSL_PATH)/ssl/ssl_conf.c $(OPENSSL_PATH)/ssl/ssl_err.c @@ -1927,12 +2054,16 @@ $(OPENSSL_PATH)/ssl/tls13_enc.c $(OPENSSL_PATH)/ssl/tls_depr.c $(OPENSSL_PATH)/ssl/tls_srp.c - $(OPENSSL_PATH)/ssl/record/dtls1_bitmap.c $(OPENSSL_PATH)/ssl/record/rec_layer_d1.c $(OPENSSL_PATH)/ssl/record/rec_layer_s3.c - $(OPENSSL_PATH)/ssl/record/ssl3_buffer.c - 
$(OPENSSL_PATH)/ssl/record/ssl3_record.c - $(OPENSSL_PATH)/ssl/record/ssl3_record_tls13.c + $(OPENSSL_PATH)/ssl/record/methods/dtls_meth.c + $(OPENSSL_PATH)/ssl/record/methods/ssl3_meth.c + $(OPENSSL_PATH)/ssl/record/methods/tls13_meth.c + $(OPENSSL_PATH)/ssl/record/methods/tls1_meth.c + $(OPENSSL_PATH)/ssl/record/methods/tls_common.c + $(OPENSSL_PATH)/ssl/record/methods/tls_multib.c + $(OPENSSL_PATH)/ssl/record/methods/tlsany_meth.c + $(OPENSSL_PATH)/ssl/rio/poll_immediate.c $(OPENSSL_PATH)/ssl/statem/extensions.c $(OPENSSL_PATH)/ssl/statem/extensions_clnt.c $(OPENSSL_PATH)/ssl/statem/extensions_cust.c @@ -1941,14 +2072,18 @@ $(OPENSSL_PATH)/ssl/statem/statem_dtls.c $(OPENSSL_PATH)/ssl/statem/statem_lib.c $(OPENSSL_GEN_PATH)/AARCH64-GCC/crypto/aes/aesv8-armx.S | GCC + $(OPENSSL_GEN_PATH)/AARCH64-GCC/crypto/aes/bsaes-armv8.S | GCC $(OPENSSL_GEN_PATH)/AARCH64-GCC/crypto/aes/vpaes-armv8.S | GCC $(OPENSSL_GEN_PATH)/AARCH64-GCC/crypto/arm64cpuid.S | GCC + $(OPENSSL_GEN_PATH)/AARCH64-GCC/crypto/md5/md5-aarch64.S | GCC + $(OPENSSL_GEN_PATH)/AARCH64-GCC/crypto/modes/aes-gcm-armv8-unroll8_64.S | GCC $(OPENSSL_GEN_PATH)/AARCH64-GCC/crypto/modes/aes-gcm-armv8_64.S | GCC $(OPENSSL_GEN_PATH)/AARCH64-GCC/crypto/modes/ghashv8-armx.S | GCC $(OPENSSL_GEN_PATH)/AARCH64-GCC/crypto/sha/keccak1600-armv8.S | GCC $(OPENSSL_GEN_PATH)/AARCH64-GCC/crypto/sha/sha1-armv8.S | GCC $(OPENSSL_GEN_PATH)/AARCH64-GCC/crypto/sha/sha256-armv8.S | GCC $(OPENSSL_GEN_PATH)/AARCH64-GCC/crypto/sha/sha512-armv8.S | GCC + $(OPENSSL_GEN_PATH)/AARCH64-GCC/crypto/sm3/sm3-armv8.S | GCC # Autogenerated files list ends here [Packages] @@ -1979,16 +2114,16 @@ # C4819: The file contains a character that cannot be represented in the current code page # C4133: incompatible types - from 'ASN1_TYPE *' to 'const ASN1_STRING *' (v3_genn.c(101)) # - MSFT:*_*_IA32_CC_FLAGS = -U_WIN32 -U_WIN64 -U_MSC_VER $(OPENSSL_FLAGS) $(OPENSSL_FLAGS_IA32) /wd4090 /wd4132 /wd4210 /wd4244 /wd4245 /wd4267 /wd4310 /wd4389 /wd4700 /wd4702 /wd4706 /wd4819 /wd4133 - MSFT:*_*_X64_CC_FLAGS = -U_WIN32 -U_WIN64 -U_MSC_VER $(OPENSSL_FLAGS) $(OPENSSL_FLAGS_X64) /wd4090 /wd4132 /wd4210 /wd4244 /wd4245 /wd4267 /wd4306 /wd4310 /wd4700 /wd4389 /wd4702 /wd4706 /wd4819 /wd4133 + MSFT:*_*_IA32_CC_FLAGS = -U_WIN32 -U_WIN64 -U_MSC_VER $(OPENSSL_FLAGS) $(OPENSSL_FLAGS_IA32) /wd4090 /wd4132 /wd4210 /wd4244 /wd4245 /wd4267 /wd4310 /wd4389 /wd4700 /wd4702 /wd4706 /wd4819 /wd4133 /wd4701 /wd4703 /wd4189 + MSFT:*_*_X64_CC_FLAGS = -U_WIN32 -U_WIN64 -U_MSC_VER $(OPENSSL_FLAGS) $(OPENSSL_FLAGS_X64) /wd4090 /wd4132 /wd4210 /wd4244 /wd4245 /wd4267 /wd4306 /wd4310 /wd4700 /wd4389 /wd4702 /wd4706 /wd4819 /wd4133 /wd4701 /wd4703 /wd4189 # # Disable following Visual Studio 2015 compiler warnings brought by openssl source, # so we do not break the build with /WX option: # C4718: recursive call has no side effects, deleting # - MSFT:*_VS2015x86_IA32_CC_FLAGS = /wd4718 - MSFT:*_VS2015x86_X64_CC_FLAGS = /wd4718 + MSFT:*_VS2015x86_IA32_CC_FLAGS = /wd4718 /wd4701 /wd4703 /wd4189 + MSFT:*_VS2015x86_X64_CC_FLAGS = /wd4718 /wd4701 /wd4703 /wd4189 INTEL:*_*_IA32_CC_FLAGS = -U_WIN32 -U_WIN64 -U_MSC_VER -U__ICC $(OPENSSL_FLAGS) $(OPENSSL_FLAGS_IA32) /w INTEL:*_*_X64_CC_FLAGS = -U_WIN32 -U_WIN64 -U_MSC_VER -U__ICC $(OPENSSL_FLAGS) $(OPENSSL_FLAGS_X64) /w diff --git a/CryptoPkg/Library/OpensslLib/OpensslLibCrypto.inf b/CryptoPkg/Library/OpensslLib/OpensslLibCrypto.inf index 77ce80f2b81c..a239fca95dc1 100644 --- a/CryptoPkg/Library/OpensslLib/OpensslLibCrypto.inf +++ 
b/CryptoPkg/Library/OpensslLib/OpensslLibCrypto.inf @@ -129,6 +129,7 @@ $(OPENSSL_PATH)/crypto/bio/bss_conn.c $(OPENSSL_PATH)/crypto/bio/bss_core.c $(OPENSSL_PATH)/crypto/bio/bss_dgram.c + $(OPENSSL_PATH)/crypto/bio/bss_dgram_pair.c $(OPENSSL_PATH)/crypto/bio/bss_fd.c $(OPENSSL_PATH)/crypto/bio/bss_file.c $(OPENSSL_PATH)/crypto/bio/bss_log.c @@ -170,7 +171,9 @@ $(OPENSSL_PATH)/crypto/bn/bn_x931p.c $(OPENSSL_PATH)/crypto/buffer/buf_err.c $(OPENSSL_PATH)/crypto/buffer/buffer.c + $(OPENSSL_PATH)/crypto/comp/c_brotli.c $(OPENSSL_PATH)/crypto/comp/c_zlib.c + $(OPENSSL_PATH)/crypto/comp/c_zstd.c $(OPENSSL_PATH)/crypto/comp/comp_err.c $(OPENSSL_PATH)/crypto/comp/comp_lib.c $(OPENSSL_PATH)/crypto/conf/conf_api.c @@ -210,7 +213,9 @@ $(OPENSSL_PATH)/crypto/err/err_all.c $(OPENSSL_PATH)/crypto/err/err_all_legacy.c $(OPENSSL_PATH)/crypto/err/err_blocks.c + $(OPENSSL_PATH)/crypto/err/err_mark.c $(OPENSSL_PATH)/crypto/err/err_prn.c + $(OPENSSL_PATH)/crypto/err/err_save.c $(OPENSSL_PATH)/crypto/ess/ess_asn1.c $(OPENSSL_PATH)/crypto/ess/ess_err.c $(OPENSSL_PATH)/crypto/ess/ess_lib.c @@ -293,7 +298,10 @@ $(OPENSSL_PATH)/crypto/ffc/ffc_params.c $(OPENSSL_PATH)/crypto/ffc/ffc_params_generate.c $(OPENSSL_PATH)/crypto/ffc/ffc_params_validate.c + $(OPENSSL_PATH)/crypto/hashtable/hashtable.c $(OPENSSL_PATH)/crypto/hmac/hmac.c + $(OPENSSL_PATH)/crypto/hpke/hpke.c + $(OPENSSL_PATH)/crypto/hpke/hpke_util.c $(OPENSSL_PATH)/crypto/http/http_client.c $(OPENSSL_PATH)/crypto/http/http_err.c $(OPENSSL_PATH)/crypto/http/http_lib.c @@ -302,6 +310,7 @@ $(OPENSSL_PATH)/crypto/lhash/lhash.c $(OPENSSL_PATH)/crypto/asn1_dsa.c $(OPENSSL_PATH)/crypto/bsearch.c + $(OPENSSL_PATH)/crypto/comp_methods.c $(OPENSSL_PATH)/crypto/context.c $(OPENSSL_PATH)/crypto/core_algorithm.c $(OPENSSL_PATH)/crypto/core_fetch.c @@ -311,10 +320,13 @@ $(OPENSSL_PATH)/crypto/cryptlib.c $(OPENSSL_PATH)/crypto/ctype.c $(OPENSSL_PATH)/crypto/cversion.c + $(OPENSSL_PATH)/crypto/defaults.c $(OPENSSL_PATH)/crypto/der_writer.c + $(OPENSSL_PATH)/crypto/deterministic_nonce.c $(OPENSSL_PATH)/crypto/ebcdic.c $(OPENSSL_PATH)/crypto/ex_data.c $(OPENSSL_PATH)/crypto/getenv.c + $(OPENSSL_PATH)/crypto/indicator_core.c $(OPENSSL_PATH)/crypto/info.c $(OPENSSL_PATH)/crypto/init.c $(OPENSSL_PATH)/crypto/initthread.c @@ -338,12 +350,15 @@ $(OPENSSL_PATH)/crypto/provider_conf.c $(OPENSSL_PATH)/crypto/provider_core.c $(OPENSSL_PATH)/crypto/punycode.c + $(OPENSSL_PATH)/crypto/quic_vlint.c $(OPENSSL_PATH)/crypto/self_test_core.c + $(OPENSSL_PATH)/crypto/sleep.c $(OPENSSL_PATH)/crypto/sparse_array.c $(OPENSSL_PATH)/crypto/threads_lib.c $(OPENSSL_PATH)/crypto/threads_none.c $(OPENSSL_PATH)/crypto/threads_pthread.c $(OPENSSL_PATH)/crypto/threads_win.c + $(OPENSSL_PATH)/crypto/time.c $(OPENSSL_PATH)/crypto/trace.c $(OPENSSL_PATH)/crypto/uid.c $(OPENSSL_PATH)/crypto/md5/md5_dgst.c @@ -360,6 +375,7 @@ $(OPENSSL_PATH)/crypto/modes/siv128.c $(OPENSSL_PATH)/crypto/modes/wrap128.c $(OPENSSL_PATH)/crypto/modes/xts128.c + $(OPENSSL_PATH)/crypto/modes/xts128gb.c $(OPENSSL_PATH)/crypto/objects/o_names.c $(OPENSSL_PATH)/crypto/objects/obj_dat.c $(OPENSSL_PATH)/crypto/objects/obj_err.c @@ -396,6 +412,7 @@ $(OPENSSL_PATH)/crypto/rand/rand_lib.c $(OPENSSL_PATH)/crypto/rand/rand_meth.c $(OPENSSL_PATH)/crypto/rand/rand_pool.c + $(OPENSSL_PATH)/crypto/rand/rand_uniform.c $(OPENSSL_PATH)/crypto/rsa/rsa_ameth.c $(OPENSSL_PATH)/crypto/rsa/rsa_asn1.c $(OPENSSL_PATH)/crypto/rsa/rsa_backend.c @@ -430,6 +447,8 @@ $(OPENSSL_PATH)/crypto/sm3/legacy_sm3.c $(OPENSSL_PATH)/crypto/sm3/sm3.c 
$(OPENSSL_PATH)/crypto/stack/stack.c + $(OPENSSL_PATH)/crypto/thread/arch/thread_win.c + $(OPENSSL_PATH)/crypto/thread/api.c $(OPENSSL_PATH)/crypto/txt_db/txt_db.c $(OPENSSL_PATH)/crypto/ui/ui_err.c $(OPENSSL_PATH)/crypto/ui/ui_lib.c @@ -445,14 +464,18 @@ $(OPENSSL_PATH)/crypto/x509/pcy_map.c $(OPENSSL_PATH)/crypto/x509/pcy_node.c $(OPENSSL_PATH)/crypto/x509/pcy_tree.c + $(OPENSSL_PATH)/crypto/x509/t_acert.c $(OPENSSL_PATH)/crypto/x509/t_crl.c $(OPENSSL_PATH)/crypto/x509/t_req.c $(OPENSSL_PATH)/crypto/x509/t_x509.c + $(OPENSSL_PATH)/crypto/x509/v3_ac_tgt.c $(OPENSSL_PATH)/crypto/x509/v3_addr.c $(OPENSSL_PATH)/crypto/x509/v3_admis.c $(OPENSSL_PATH)/crypto/x509/v3_akeya.c $(OPENSSL_PATH)/crypto/x509/v3_akid.c $(OPENSSL_PATH)/crypto/x509/v3_asid.c + $(OPENSSL_PATH)/crypto/x509/v3_audit_id.c + $(OPENSSL_PATH)/crypto/x509/v3_battcons.c $(OPENSSL_PATH)/crypto/x509/v3_bcons.c $(OPENSSL_PATH)/crypto/x509/v3_bitst.c $(OPENSSL_PATH)/crypto/x509/v3_conf.c @@ -461,12 +484,17 @@ $(OPENSSL_PATH)/crypto/x509/v3_enum.c $(OPENSSL_PATH)/crypto/x509/v3_extku.c $(OPENSSL_PATH)/crypto/x509/v3_genn.c + $(OPENSSL_PATH)/crypto/x509/v3_group_ac.c $(OPENSSL_PATH)/crypto/x509/v3_ia5.c + $(OPENSSL_PATH)/crypto/x509/v3_ind_iss.c $(OPENSSL_PATH)/crypto/x509/v3_info.c $(OPENSSL_PATH)/crypto/x509/v3_int.c + $(OPENSSL_PATH)/crypto/x509/v3_iobo.c $(OPENSSL_PATH)/crypto/x509/v3_ist.c $(OPENSSL_PATH)/crypto/x509/v3_lib.c $(OPENSSL_PATH)/crypto/x509/v3_ncons.c + $(OPENSSL_PATH)/crypto/x509/v3_no_ass.c + $(OPENSSL_PATH)/crypto/x509/v3_no_rev_avail.c $(OPENSSL_PATH)/crypto/x509/v3_pci.c $(OPENSSL_PATH)/crypto/x509/v3_pcia.c $(OPENSSL_PATH)/crypto/x509/v3_pcons.c @@ -475,12 +503,17 @@ $(OPENSSL_PATH)/crypto/x509/v3_prn.c $(OPENSSL_PATH)/crypto/x509/v3_purp.c $(OPENSSL_PATH)/crypto/x509/v3_san.c + $(OPENSSL_PATH)/crypto/x509/v3_sda.c + $(OPENSSL_PATH)/crypto/x509/v3_single_use.c $(OPENSSL_PATH)/crypto/x509/v3_skid.c + $(OPENSSL_PATH)/crypto/x509/v3_soa_id.c $(OPENSSL_PATH)/crypto/x509/v3_sxnet.c $(OPENSSL_PATH)/crypto/x509/v3_tlsf.c + $(OPENSSL_PATH)/crypto/x509/v3_usernotice.c $(OPENSSL_PATH)/crypto/x509/v3_utf8.c $(OPENSSL_PATH)/crypto/x509/v3_utl.c $(OPENSSL_PATH)/crypto/x509/v3err.c + $(OPENSSL_PATH)/crypto/x509/x509_acert.c $(OPENSSL_PATH)/crypto/x509/x509_att.c $(OPENSSL_PATH)/crypto/x509/x509_cmp.c $(OPENSSL_PATH)/crypto/x509/x509_d2.c @@ -498,6 +531,7 @@ $(OPENSSL_PATH)/crypto/x509/x509_v3.c $(OPENSSL_PATH)/crypto/x509/x509_vfy.c $(OPENSSL_PATH)/crypto/x509/x509_vpm.c + $(OPENSSL_PATH)/crypto/x509/x509aset.c $(OPENSSL_PATH)/crypto/x509/x509cset.c $(OPENSSL_PATH)/crypto/x509/x509name.c $(OPENSSL_PATH)/crypto/x509/x509rset.c @@ -507,6 +541,7 @@ $(OPENSSL_PATH)/crypto/x509/x_attrib.c $(OPENSSL_PATH)/crypto/x509/x_crl.c $(OPENSSL_PATH)/crypto/x509/x_exten.c + $(OPENSSL_PATH)/crypto/x509/x_ietfatt.c $(OPENSSL_PATH)/crypto/x509/x_name.c $(OPENSSL_PATH)/crypto/x509/x_pubkey.c $(OPENSSL_PATH)/crypto/x509/x_req.c @@ -553,7 +588,9 @@ $(OPENSSL_PATH)/providers/implementations/encode_decode/endecoder_common.c $(OPENSSL_PATH)/providers/implementations/exchange/dh_exch.c $(OPENSSL_PATH)/providers/implementations/exchange/kdf_exch.c + $(OPENSSL_PATH)/providers/implementations/kdfs/argon2.c $(OPENSSL_PATH)/providers/implementations/kdfs/hkdf.c + $(OPENSSL_PATH)/providers/implementations/kdfs/hmacdrbg_kdf.c $(OPENSSL_PATH)/providers/implementations/kdfs/kbkdf.c $(OPENSSL_PATH)/providers/implementations/kdfs/krb5kdf.c $(OPENSSL_PATH)/providers/implementations/kdfs/pbkdf2.c @@ -572,12 +609,12 @@ 
$(OPENSSL_PATH)/providers/implementations/macs/gmac_prov.c $(OPENSSL_PATH)/providers/implementations/macs/hmac_prov.c $(OPENSSL_PATH)/providers/implementations/macs/kmac_prov.c - $(OPENSSL_PATH)/providers/implementations/rands/crngt.c $(OPENSSL_PATH)/providers/implementations/rands/drbg.c $(OPENSSL_PATH)/providers/implementations/rands/drbg_ctr.c $(OPENSSL_PATH)/providers/implementations/rands/drbg_hash.c $(OPENSSL_PATH)/providers/implementations/rands/drbg_hmac.c $(OPENSSL_PATH)/providers/implementations/rands/seed_src.c + $(OPENSSL_PATH)/providers/implementations/rands/seed_src_jitter.c $(OPENSSL_PATH)/providers/implementations/rands/test_rng.c $(OPENSSL_PATH)/providers/implementations/rands/seeding/rand_cpu_x86.c $(OPENSSL_PATH)/providers/implementations/rands/seeding/rand_tsc.c @@ -585,7 +622,8 @@ $(OPENSSL_PATH)/providers/implementations/rands/seeding/rand_win.c $(OPENSSL_PATH)/providers/implementations/signature/mac_legacy_sig.c $(OPENSSL_PATH)/providers/implementations/signature/rsa_sig.c - $(OPENSSL_PATH)/ssl/s3_cbc.c + $(OPENSSL_PATH)/ssl/record/methods/ssl3_cbc.c + $(OPENSSL_GEN_PATH)/crypto/params_idx.c $(OPENSSL_PATH)/providers/common/der/der_rsa_key.c $(OPENSSL_PATH)/providers/common/provider_ctx.c $(OPENSSL_PATH)/providers/common/provider_err.c @@ -597,7 +635,7 @@ $(OPENSSL_PATH)/providers/implementations/ciphers/ciphercommon_gcm_hw.c $(OPENSSL_PATH)/providers/implementations/ciphers/ciphercommon_hw.c $(OPENSSL_PATH)/providers/implementations/digests/digestcommon.c - $(OPENSSL_PATH)/ssl/record/tls_pad.c + $(OPENSSL_PATH)/ssl/record/methods/tls_pad.c $(OPENSSL_GEN_PATH)/providers/common/der/der_digests_gen.c $(OPENSSL_GEN_PATH)/providers/common/der/der_rsa_gen.c $(OPENSSL_GEN_PATH)/providers/common/der/der_wrap_gen.c @@ -643,16 +681,16 @@ # C4819: The file contains a character that cannot be represented in the current code page # C4133: incompatible types - from 'ASN1_TYPE *' to 'const ASN1_STRING *' (v3_genn.c(101)) # - MSFT:*_*_IA32_CC_FLAGS = -U_WIN32 -U_WIN64 -U_MSC_VER $(OPENSSL_FLAGS) $(OPENSSL_FLAGS_NOASM) /wd4090 /wd4132 /wd4210 /wd4244 /wd4245 /wd4267 /wd4310 /wd4389 /wd4700 /wd4702 /wd4706 /wd4819 /wd4133 - MSFT:*_*_X64_CC_FLAGS = -U_WIN32 -U_WIN64 -U_MSC_VER $(OPENSSL_FLAGS) $(OPENSSL_FLAGS_NOASM) /wd4090 /wd4132 /wd4210 /wd4244 /wd4245 /wd4267 /wd4306 /wd4310 /wd4700 /wd4389 /wd4702 /wd4706 /wd4819 /wd4133 + MSFT:*_*_IA32_CC_FLAGS = -U_WIN32 -U_WIN64 -U_MSC_VER $(OPENSSL_FLAGS) $(OPENSSL_FLAGS_NOASM) /wd4090 /wd4132 /wd4210 /wd4244 /wd4245 /wd4267 /wd4310 /wd4389 /wd4700 /wd4702 /wd4706 /wd4819 /wd4133 /wd4701 /wd4703 /wd4189 + MSFT:*_*_X64_CC_FLAGS = -U_WIN32 -U_WIN64 -U_MSC_VER $(OPENSSL_FLAGS) $(OPENSSL_FLAGS_NOASM) /wd4090 /wd4132 /wd4210 /wd4244 /wd4245 /wd4267 /wd4306 /wd4310 /wd4700 /wd4389 /wd4702 /wd4706 /wd4819 /wd4133 /wd4701 /wd4703 /wd4189 # # Disable following Visual Studio 2015 compiler warnings brought by openssl source, # so we do not break the build with /WX option: # C4718: recursive call has no side effects, deleting # - MSFT:*_VS2015x86_IA32_CC_FLAGS = /wd4718 - MSFT:*_VS2015x86_X64_CC_FLAGS = /wd4718 + MSFT:*_VS2015x86_IA32_CC_FLAGS = /wd4718 /wd4701 /wd4703 /wd4189 + MSFT:*_VS2015x86_X64_CC_FLAGS = /wd4718 /wd4701 /wd4703 /wd4189 INTEL:*_*_IA32_CC_FLAGS = -U_WIN32 -U_WIN64 -U_MSC_VER -U__ICC $(OPENSSL_FLAGS) $(OPENSSL_FLAGS_NOASM) /w INTEL:*_*_X64_CC_FLAGS = -U_WIN32 -U_WIN64 -U_MSC_VER -U__ICC $(OPENSSL_FLAGS) $(OPENSSL_FLAGS_NOASM) /w diff --git a/CryptoPkg/Library/OpensslLib/OpensslLibFull.inf 
b/CryptoPkg/Library/OpensslLib/OpensslLibFull.inf index 32c79c39bb4f..cdcab5eba5bc 100644 --- a/CryptoPkg/Library/OpensslLib/OpensslLibFull.inf +++ b/CryptoPkg/Library/OpensslLib/OpensslLibFull.inf @@ -133,6 +133,7 @@ $(OPENSSL_PATH)/crypto/bio/bss_conn.c $(OPENSSL_PATH)/crypto/bio/bss_core.c $(OPENSSL_PATH)/crypto/bio/bss_dgram.c + $(OPENSSL_PATH)/crypto/bio/bss_dgram_pair.c $(OPENSSL_PATH)/crypto/bio/bss_fd.c $(OPENSSL_PATH)/crypto/bio/bss_file.c $(OPENSSL_PATH)/crypto/bio/bss_log.c @@ -174,7 +175,9 @@ $(OPENSSL_PATH)/crypto/bn/bn_x931p.c $(OPENSSL_PATH)/crypto/buffer/buf_err.c $(OPENSSL_PATH)/crypto/buffer/buffer.c + $(OPENSSL_PATH)/crypto/comp/c_brotli.c $(OPENSSL_PATH)/crypto/comp/c_zlib.c + $(OPENSSL_PATH)/crypto/comp/c_zstd.c $(OPENSSL_PATH)/crypto/comp/comp_err.c $(OPENSSL_PATH)/crypto/comp/comp_lib.c $(OPENSSL_PATH)/crypto/conf/conf_api.c @@ -252,7 +255,9 @@ $(OPENSSL_PATH)/crypto/err/err_all.c $(OPENSSL_PATH)/crypto/err/err_all_legacy.c $(OPENSSL_PATH)/crypto/err/err_blocks.c + $(OPENSSL_PATH)/crypto/err/err_mark.c $(OPENSSL_PATH)/crypto/err/err_prn.c + $(OPENSSL_PATH)/crypto/err/err_save.c $(OPENSSL_PATH)/crypto/ess/ess_asn1.c $(OPENSSL_PATH)/crypto/ess/ess_err.c $(OPENSSL_PATH)/crypto/ess/ess_lib.c @@ -335,7 +340,10 @@ $(OPENSSL_PATH)/crypto/ffc/ffc_params.c $(OPENSSL_PATH)/crypto/ffc/ffc_params_generate.c $(OPENSSL_PATH)/crypto/ffc/ffc_params_validate.c + $(OPENSSL_PATH)/crypto/hashtable/hashtable.c $(OPENSSL_PATH)/crypto/hmac/hmac.c + $(OPENSSL_PATH)/crypto/hpke/hpke.c + $(OPENSSL_PATH)/crypto/hpke/hpke_util.c $(OPENSSL_PATH)/crypto/http/http_client.c $(OPENSSL_PATH)/crypto/http/http_err.c $(OPENSSL_PATH)/crypto/http/http_lib.c @@ -344,6 +352,7 @@ $(OPENSSL_PATH)/crypto/lhash/lhash.c $(OPENSSL_PATH)/crypto/asn1_dsa.c $(OPENSSL_PATH)/crypto/bsearch.c + $(OPENSSL_PATH)/crypto/comp_methods.c $(OPENSSL_PATH)/crypto/context.c $(OPENSSL_PATH)/crypto/core_algorithm.c $(OPENSSL_PATH)/crypto/core_fetch.c @@ -353,10 +362,13 @@ $(OPENSSL_PATH)/crypto/cryptlib.c $(OPENSSL_PATH)/crypto/ctype.c $(OPENSSL_PATH)/crypto/cversion.c + $(OPENSSL_PATH)/crypto/defaults.c $(OPENSSL_PATH)/crypto/der_writer.c + $(OPENSSL_PATH)/crypto/deterministic_nonce.c $(OPENSSL_PATH)/crypto/ebcdic.c $(OPENSSL_PATH)/crypto/ex_data.c $(OPENSSL_PATH)/crypto/getenv.c + $(OPENSSL_PATH)/crypto/indicator_core.c $(OPENSSL_PATH)/crypto/info.c $(OPENSSL_PATH)/crypto/init.c $(OPENSSL_PATH)/crypto/initthread.c @@ -380,12 +392,15 @@ $(OPENSSL_PATH)/crypto/provider_conf.c $(OPENSSL_PATH)/crypto/provider_core.c $(OPENSSL_PATH)/crypto/punycode.c + $(OPENSSL_PATH)/crypto/quic_vlint.c $(OPENSSL_PATH)/crypto/self_test_core.c + $(OPENSSL_PATH)/crypto/sleep.c $(OPENSSL_PATH)/crypto/sparse_array.c $(OPENSSL_PATH)/crypto/threads_lib.c $(OPENSSL_PATH)/crypto/threads_none.c $(OPENSSL_PATH)/crypto/threads_pthread.c $(OPENSSL_PATH)/crypto/threads_win.c + $(OPENSSL_PATH)/crypto/time.c $(OPENSSL_PATH)/crypto/trace.c $(OPENSSL_PATH)/crypto/uid.c $(OPENSSL_PATH)/crypto/md5/md5_dgst.c @@ -402,6 +417,7 @@ $(OPENSSL_PATH)/crypto/modes/siv128.c $(OPENSSL_PATH)/crypto/modes/wrap128.c $(OPENSSL_PATH)/crypto/modes/xts128.c + $(OPENSSL_PATH)/crypto/modes/xts128gb.c $(OPENSSL_PATH)/crypto/objects/o_names.c $(OPENSSL_PATH)/crypto/objects/obj_dat.c $(OPENSSL_PATH)/crypto/objects/obj_err.c @@ -438,6 +454,7 @@ $(OPENSSL_PATH)/crypto/rand/rand_lib.c $(OPENSSL_PATH)/crypto/rand/rand_meth.c $(OPENSSL_PATH)/crypto/rand/rand_pool.c + $(OPENSSL_PATH)/crypto/rand/rand_uniform.c $(OPENSSL_PATH)/crypto/rsa/rsa_ameth.c $(OPENSSL_PATH)/crypto/rsa/rsa_asn1.c 
$(OPENSSL_PATH)/crypto/rsa/rsa_backend.c @@ -472,6 +489,8 @@ $(OPENSSL_PATH)/crypto/sm3/legacy_sm3.c $(OPENSSL_PATH)/crypto/sm3/sm3.c $(OPENSSL_PATH)/crypto/stack/stack.c + $(OPENSSL_PATH)/crypto/thread/arch/thread_win.c + $(OPENSSL_PATH)/crypto/thread/api.c $(OPENSSL_PATH)/crypto/txt_db/txt_db.c $(OPENSSL_PATH)/crypto/ui/ui_err.c $(OPENSSL_PATH)/crypto/ui/ui_lib.c @@ -487,14 +506,18 @@ $(OPENSSL_PATH)/crypto/x509/pcy_map.c $(OPENSSL_PATH)/crypto/x509/pcy_node.c $(OPENSSL_PATH)/crypto/x509/pcy_tree.c + $(OPENSSL_PATH)/crypto/x509/t_acert.c $(OPENSSL_PATH)/crypto/x509/t_crl.c $(OPENSSL_PATH)/crypto/x509/t_req.c $(OPENSSL_PATH)/crypto/x509/t_x509.c + $(OPENSSL_PATH)/crypto/x509/v3_ac_tgt.c $(OPENSSL_PATH)/crypto/x509/v3_addr.c $(OPENSSL_PATH)/crypto/x509/v3_admis.c $(OPENSSL_PATH)/crypto/x509/v3_akeya.c $(OPENSSL_PATH)/crypto/x509/v3_akid.c $(OPENSSL_PATH)/crypto/x509/v3_asid.c + $(OPENSSL_PATH)/crypto/x509/v3_audit_id.c + $(OPENSSL_PATH)/crypto/x509/v3_battcons.c $(OPENSSL_PATH)/crypto/x509/v3_bcons.c $(OPENSSL_PATH)/crypto/x509/v3_bitst.c $(OPENSSL_PATH)/crypto/x509/v3_conf.c @@ -503,12 +526,17 @@ $(OPENSSL_PATH)/crypto/x509/v3_enum.c $(OPENSSL_PATH)/crypto/x509/v3_extku.c $(OPENSSL_PATH)/crypto/x509/v3_genn.c + $(OPENSSL_PATH)/crypto/x509/v3_group_ac.c $(OPENSSL_PATH)/crypto/x509/v3_ia5.c + $(OPENSSL_PATH)/crypto/x509/v3_ind_iss.c $(OPENSSL_PATH)/crypto/x509/v3_info.c $(OPENSSL_PATH)/crypto/x509/v3_int.c + $(OPENSSL_PATH)/crypto/x509/v3_iobo.c $(OPENSSL_PATH)/crypto/x509/v3_ist.c $(OPENSSL_PATH)/crypto/x509/v3_lib.c $(OPENSSL_PATH)/crypto/x509/v3_ncons.c + $(OPENSSL_PATH)/crypto/x509/v3_no_ass.c + $(OPENSSL_PATH)/crypto/x509/v3_no_rev_avail.c $(OPENSSL_PATH)/crypto/x509/v3_pci.c $(OPENSSL_PATH)/crypto/x509/v3_pcia.c $(OPENSSL_PATH)/crypto/x509/v3_pcons.c @@ -517,12 +545,17 @@ $(OPENSSL_PATH)/crypto/x509/v3_prn.c $(OPENSSL_PATH)/crypto/x509/v3_purp.c $(OPENSSL_PATH)/crypto/x509/v3_san.c + $(OPENSSL_PATH)/crypto/x509/v3_sda.c + $(OPENSSL_PATH)/crypto/x509/v3_single_use.c $(OPENSSL_PATH)/crypto/x509/v3_skid.c + $(OPENSSL_PATH)/crypto/x509/v3_soa_id.c $(OPENSSL_PATH)/crypto/x509/v3_sxnet.c $(OPENSSL_PATH)/crypto/x509/v3_tlsf.c + $(OPENSSL_PATH)/crypto/x509/v3_usernotice.c $(OPENSSL_PATH)/crypto/x509/v3_utf8.c $(OPENSSL_PATH)/crypto/x509/v3_utl.c $(OPENSSL_PATH)/crypto/x509/v3err.c + $(OPENSSL_PATH)/crypto/x509/x509_acert.c $(OPENSSL_PATH)/crypto/x509/x509_att.c $(OPENSSL_PATH)/crypto/x509/x509_cmp.c $(OPENSSL_PATH)/crypto/x509/x509_d2.c @@ -540,6 +573,7 @@ $(OPENSSL_PATH)/crypto/x509/x509_v3.c $(OPENSSL_PATH)/crypto/x509/x509_vfy.c $(OPENSSL_PATH)/crypto/x509/x509_vpm.c + $(OPENSSL_PATH)/crypto/x509/x509aset.c $(OPENSSL_PATH)/crypto/x509/x509cset.c $(OPENSSL_PATH)/crypto/x509/x509name.c $(OPENSSL_PATH)/crypto/x509/x509rset.c @@ -549,6 +583,7 @@ $(OPENSSL_PATH)/crypto/x509/x_attrib.c $(OPENSSL_PATH)/crypto/x509/x_crl.c $(OPENSSL_PATH)/crypto/x509/x_exten.c + $(OPENSSL_PATH)/crypto/x509/x_ietfatt.c $(OPENSSL_PATH)/crypto/x509/x_name.c $(OPENSSL_PATH)/crypto/x509/x_pubkey.c $(OPENSSL_PATH)/crypto/x509/x_req.c @@ -597,7 +632,9 @@ $(OPENSSL_PATH)/providers/implementations/exchange/ecdh_exch.c $(OPENSSL_PATH)/providers/implementations/exchange/ecx_exch.c $(OPENSSL_PATH)/providers/implementations/exchange/kdf_exch.c + $(OPENSSL_PATH)/providers/implementations/kdfs/argon2.c $(OPENSSL_PATH)/providers/implementations/kdfs/hkdf.c + $(OPENSSL_PATH)/providers/implementations/kdfs/hmacdrbg_kdf.c $(OPENSSL_PATH)/providers/implementations/kdfs/kbkdf.c $(OPENSSL_PATH)/providers/implementations/kdfs/krb5kdf.c 
$(OPENSSL_PATH)/providers/implementations/kdfs/pbkdf2.c @@ -608,6 +645,9 @@ $(OPENSSL_PATH)/providers/implementations/kdfs/sskdf.c $(OPENSSL_PATH)/providers/implementations/kdfs/tls1_prf.c $(OPENSSL_PATH)/providers/implementations/kdfs/x942kdf.c + $(OPENSSL_PATH)/providers/implementations/kem/ec_kem.c + $(OPENSSL_PATH)/providers/implementations/kem/ecx_kem.c + $(OPENSSL_PATH)/providers/implementations/kem/kem_util.c $(OPENSSL_PATH)/providers/implementations/kem/rsa_kem.c $(OPENSSL_PATH)/providers/implementations/keymgmt/dh_kmgmt.c $(OPENSSL_PATH)/providers/implementations/keymgmt/ec_kmgmt.c @@ -618,12 +658,12 @@ $(OPENSSL_PATH)/providers/implementations/macs/gmac_prov.c $(OPENSSL_PATH)/providers/implementations/macs/hmac_prov.c $(OPENSSL_PATH)/providers/implementations/macs/kmac_prov.c - $(OPENSSL_PATH)/providers/implementations/rands/crngt.c $(OPENSSL_PATH)/providers/implementations/rands/drbg.c $(OPENSSL_PATH)/providers/implementations/rands/drbg_ctr.c $(OPENSSL_PATH)/providers/implementations/rands/drbg_hash.c $(OPENSSL_PATH)/providers/implementations/rands/drbg_hmac.c $(OPENSSL_PATH)/providers/implementations/rands/seed_src.c + $(OPENSSL_PATH)/providers/implementations/rands/seed_src_jitter.c $(OPENSSL_PATH)/providers/implementations/rands/test_rng.c $(OPENSSL_PATH)/providers/implementations/rands/seeding/rand_cpu_x86.c $(OPENSSL_PATH)/providers/implementations/rands/seeding/rand_tsc.c @@ -633,7 +673,8 @@ $(OPENSSL_PATH)/providers/implementations/signature/eddsa_sig.c $(OPENSSL_PATH)/providers/implementations/signature/mac_legacy_sig.c $(OPENSSL_PATH)/providers/implementations/signature/rsa_sig.c - $(OPENSSL_PATH)/ssl/s3_cbc.c + $(OPENSSL_PATH)/ssl/record/methods/ssl3_cbc.c + $(OPENSSL_GEN_PATH)/crypto/params_idx.c $(OPENSSL_PATH)/providers/common/der/der_ec_key.c $(OPENSSL_PATH)/providers/common/der/der_ec_sig.c $(OPENSSL_PATH)/providers/common/der/der_ecx_key.c @@ -648,7 +689,7 @@ $(OPENSSL_PATH)/providers/implementations/ciphers/ciphercommon_gcm_hw.c $(OPENSSL_PATH)/providers/implementations/ciphers/ciphercommon_hw.c $(OPENSSL_PATH)/providers/implementations/digests/digestcommon.c - $(OPENSSL_PATH)/ssl/record/tls_pad.c + $(OPENSSL_PATH)/ssl/record/methods/tls_pad.c $(OPENSSL_GEN_PATH)/providers/common/der/der_digests_gen.c $(OPENSSL_GEN_PATH)/providers/common/der/der_ec_gen.c $(OPENSSL_GEN_PATH)/providers/common/der/der_ecx_gen.c @@ -665,6 +706,7 @@ $(OPENSSL_PATH)/ssl/s3_msg.c $(OPENSSL_PATH)/ssl/ssl_asn1.c $(OPENSSL_PATH)/ssl/ssl_cert.c + $(OPENSSL_PATH)/ssl/ssl_cert_comp.c $(OPENSSL_PATH)/ssl/ssl_ciph.c $(OPENSSL_PATH)/ssl/ssl_conf.c $(OPENSSL_PATH)/ssl/ssl_err.c @@ -684,12 +726,16 @@ $(OPENSSL_PATH)/ssl/tls13_enc.c $(OPENSSL_PATH)/ssl/tls_depr.c $(OPENSSL_PATH)/ssl/tls_srp.c - $(OPENSSL_PATH)/ssl/record/dtls1_bitmap.c $(OPENSSL_PATH)/ssl/record/rec_layer_d1.c $(OPENSSL_PATH)/ssl/record/rec_layer_s3.c - $(OPENSSL_PATH)/ssl/record/ssl3_buffer.c - $(OPENSSL_PATH)/ssl/record/ssl3_record.c - $(OPENSSL_PATH)/ssl/record/ssl3_record_tls13.c + $(OPENSSL_PATH)/ssl/record/methods/dtls_meth.c + $(OPENSSL_PATH)/ssl/record/methods/ssl3_meth.c + $(OPENSSL_PATH)/ssl/record/methods/tls13_meth.c + $(OPENSSL_PATH)/ssl/record/methods/tls1_meth.c + $(OPENSSL_PATH)/ssl/record/methods/tls_common.c + $(OPENSSL_PATH)/ssl/record/methods/tls_multib.c + $(OPENSSL_PATH)/ssl/record/methods/tlsany_meth.c + $(OPENSSL_PATH)/ssl/rio/poll_immediate.c $(OPENSSL_PATH)/ssl/statem/extensions.c $(OPENSSL_PATH)/ssl/statem/extensions_clnt.c $(OPENSSL_PATH)/ssl/statem/extensions_cust.c @@ -741,16 +787,16 @@ # C4819: The 
file contains a character that cannot be represented in the current code page # C4133: incompatible types - from 'ASN1_TYPE *' to 'const ASN1_STRING *' (v3_genn.c(101)) # - MSFT:*_*_IA32_CC_FLAGS = -U_WIN32 -U_WIN64 -U_MSC_VER $(OPENSSL_FLAGS) $(OPENSSL_FLAGS_NOASM) /wd4090 /wd4132 /wd4210 /wd4244 /wd4245 /wd4267 /wd4310 /wd4389 /wd4700 /wd4702 /wd4706 /wd4819 /wd4133 - MSFT:*_*_X64_CC_FLAGS = -U_WIN32 -U_WIN64 -U_MSC_VER $(OPENSSL_FLAGS) $(OPENSSL_FLAGS_NOASM) /wd4090 /wd4132 /wd4210 /wd4244 /wd4245 /wd4267 /wd4306 /wd4310 /wd4700 /wd4389 /wd4702 /wd4706 /wd4819 /wd4133 + MSFT:*_*_IA32_CC_FLAGS = -U_WIN32 -U_WIN64 -U_MSC_VER $(OPENSSL_FLAGS) $(OPENSSL_FLAGS_NOASM) /wd4090 /wd4132 /wd4210 /wd4244 /wd4245 /wd4267 /wd4310 /wd4389 /wd4700 /wd4702 /wd4706 /wd4819 /wd4133 /wd4701 /wd4703 /wd4189 + MSFT:*_*_X64_CC_FLAGS = -U_WIN32 -U_WIN64 -U_MSC_VER $(OPENSSL_FLAGS) $(OPENSSL_FLAGS_NOASM) /wd4090 /wd4132 /wd4210 /wd4244 /wd4245 /wd4267 /wd4306 /wd4310 /wd4700 /wd4389 /wd4702 /wd4706 /wd4819 /wd4133 /wd4701 /wd4703 /wd4189 # # Disable following Visual Studio 2015 compiler warnings brought by openssl source, # so we do not break the build with /WX option: # C4718: recursive call has no side effects, deleting # - MSFT:*_VS2015x86_IA32_CC_FLAGS = /wd4718 - MSFT:*_VS2015x86_X64_CC_FLAGS = /wd4718 + MSFT:*_VS2015x86_IA32_CC_FLAGS = /wd4718 /wd4701 /wd4703 /wd4189 + MSFT:*_VS2015x86_X64_CC_FLAGS = /wd4718 /wd4701 /wd4703 /wd4189 INTEL:*_*_IA32_CC_FLAGS = -U_WIN32 -U_WIN64 -U_MSC_VER -U__ICC $(OPENSSL_FLAGS) $(OPENSSL_FLAGS_NOASM) /w INTEL:*_*_X64_CC_FLAGS = -U_WIN32 -U_WIN64 -U_MSC_VER -U__ICC $(OPENSSL_FLAGS) $(OPENSSL_FLAGS_NOASM) /w diff --git a/CryptoPkg/Library/OpensslLib/OpensslLibFullAccel.inf b/CryptoPkg/Library/OpensslLib/OpensslLibFullAccel.inf index bbcf7485f15d..32665b231bfb 100644 --- a/CryptoPkg/Library/OpensslLib/OpensslLibFullAccel.inf +++ b/CryptoPkg/Library/OpensslLib/OpensslLibFullAccel.inf @@ -30,7 +30,7 @@ DEFINE OPENSSL_FLAGS = -DL_ENDIAN -DOPENSSL_SMALL_FOOTPRINT -D_CRT_SECURE_NO_DEPRECATE -D_CRT_NONSTDC_NO_DEPRECATE DEFINE OPENSSL_FLAGS_IA32 = -DAES_ASM -DGHASH_ASM -DMD5_ASM -DOPENSSL_CPUID_OBJ -DSHA1_ASM -DSHA256_ASM -DSHA512_ASM -DVPAES_ASM DEFINE OPENSSL_FLAGS_X64 = -DAES_ASM -DBSAES_ASM -DGHASH_ASM -DKECCAK1600_ASM -DMD5_ASM -DOPENSSL_CPUID_OBJ -DSHA1_ASM -DSHA256_ASM -DSHA512_ASM -DVPAES_ASM - DEFINE OPENSSL_FLAGS_AARCH64 = -DKECCAK1600_ASM -DOPENSSL_CPUID_OBJ -DSHA1_ASM -DSHA256_ASM -DSHA512_ASM -DVPAES_ASM + DEFINE OPENSSL_FLAGS_AARCH64 = -DBSAES_ASM -DKECCAK1600_ASM -DMD5_ASM -DOPENSSL_CPUID_OBJ -DOPENSSL_SM3_ASM -DSHA1_ASM -DSHA256_ASM -DSHA512_ASM -DVPAES_ASM # # VALID_ARCHITECTURES = IA32 X64 AARCH64 @@ -151,6 +151,7 @@ $(OPENSSL_PATH)/crypto/bio/bss_conn.c $(OPENSSL_PATH)/crypto/bio/bss_core.c $(OPENSSL_PATH)/crypto/bio/bss_dgram.c + $(OPENSSL_PATH)/crypto/bio/bss_dgram_pair.c $(OPENSSL_PATH)/crypto/bio/bss_fd.c $(OPENSSL_PATH)/crypto/bio/bss_file.c $(OPENSSL_PATH)/crypto/bio/bss_log.c @@ -191,7 +192,9 @@ $(OPENSSL_PATH)/crypto/bn/bn_x931p.c $(OPENSSL_PATH)/crypto/buffer/buf_err.c $(OPENSSL_PATH)/crypto/buffer/buffer.c + $(OPENSSL_PATH)/crypto/comp/c_brotli.c $(OPENSSL_PATH)/crypto/comp/c_zlib.c + $(OPENSSL_PATH)/crypto/comp/c_zstd.c $(OPENSSL_PATH)/crypto/comp/comp_err.c $(OPENSSL_PATH)/crypto/comp/comp_lib.c $(OPENSSL_PATH)/crypto/conf/conf_api.c @@ -269,7 +272,9 @@ $(OPENSSL_PATH)/crypto/err/err_all.c $(OPENSSL_PATH)/crypto/err/err_all_legacy.c $(OPENSSL_PATH)/crypto/err/err_blocks.c + $(OPENSSL_PATH)/crypto/err/err_mark.c $(OPENSSL_PATH)/crypto/err/err_prn.c + 
$(OPENSSL_PATH)/crypto/err/err_save.c $(OPENSSL_PATH)/crypto/ess/ess_asn1.c $(OPENSSL_PATH)/crypto/ess/ess_err.c $(OPENSSL_PATH)/crypto/ess/ess_lib.c @@ -352,7 +357,10 @@ $(OPENSSL_PATH)/crypto/ffc/ffc_params.c $(OPENSSL_PATH)/crypto/ffc/ffc_params_generate.c $(OPENSSL_PATH)/crypto/ffc/ffc_params_validate.c + $(OPENSSL_PATH)/crypto/hashtable/hashtable.c $(OPENSSL_PATH)/crypto/hmac/hmac.c + $(OPENSSL_PATH)/crypto/hpke/hpke.c + $(OPENSSL_PATH)/crypto/hpke/hpke_util.c $(OPENSSL_PATH)/crypto/http/http_client.c $(OPENSSL_PATH)/crypto/http/http_err.c $(OPENSSL_PATH)/crypto/http/http_lib.c @@ -361,6 +369,7 @@ $(OPENSSL_PATH)/crypto/lhash/lhash.c $(OPENSSL_PATH)/crypto/asn1_dsa.c $(OPENSSL_PATH)/crypto/bsearch.c + $(OPENSSL_PATH)/crypto/comp_methods.c $(OPENSSL_PATH)/crypto/context.c $(OPENSSL_PATH)/crypto/core_algorithm.c $(OPENSSL_PATH)/crypto/core_fetch.c @@ -370,10 +379,13 @@ $(OPENSSL_PATH)/crypto/cryptlib.c $(OPENSSL_PATH)/crypto/ctype.c $(OPENSSL_PATH)/crypto/cversion.c + $(OPENSSL_PATH)/crypto/defaults.c $(OPENSSL_PATH)/crypto/der_writer.c + $(OPENSSL_PATH)/crypto/deterministic_nonce.c $(OPENSSL_PATH)/crypto/ebcdic.c $(OPENSSL_PATH)/crypto/ex_data.c $(OPENSSL_PATH)/crypto/getenv.c + $(OPENSSL_PATH)/crypto/indicator_core.c $(OPENSSL_PATH)/crypto/info.c $(OPENSSL_PATH)/crypto/init.c $(OPENSSL_PATH)/crypto/initthread.c @@ -396,12 +408,15 @@ $(OPENSSL_PATH)/crypto/provider_conf.c $(OPENSSL_PATH)/crypto/provider_core.c $(OPENSSL_PATH)/crypto/punycode.c + $(OPENSSL_PATH)/crypto/quic_vlint.c $(OPENSSL_PATH)/crypto/self_test_core.c + $(OPENSSL_PATH)/crypto/sleep.c $(OPENSSL_PATH)/crypto/sparse_array.c $(OPENSSL_PATH)/crypto/threads_lib.c $(OPENSSL_PATH)/crypto/threads_none.c $(OPENSSL_PATH)/crypto/threads_pthread.c $(OPENSSL_PATH)/crypto/threads_win.c + $(OPENSSL_PATH)/crypto/time.c $(OPENSSL_PATH)/crypto/trace.c $(OPENSSL_PATH)/crypto/uid.c $(OPENSSL_PATH)/crypto/md5/md5_dgst.c @@ -418,6 +433,7 @@ $(OPENSSL_PATH)/crypto/modes/siv128.c $(OPENSSL_PATH)/crypto/modes/wrap128.c $(OPENSSL_PATH)/crypto/modes/xts128.c + $(OPENSSL_PATH)/crypto/modes/xts128gb.c $(OPENSSL_PATH)/crypto/objects/o_names.c $(OPENSSL_PATH)/crypto/objects/obj_dat.c $(OPENSSL_PATH)/crypto/objects/obj_err.c @@ -454,6 +470,7 @@ $(OPENSSL_PATH)/crypto/rand/rand_lib.c $(OPENSSL_PATH)/crypto/rand/rand_meth.c $(OPENSSL_PATH)/crypto/rand/rand_pool.c + $(OPENSSL_PATH)/crypto/rand/rand_uniform.c $(OPENSSL_PATH)/crypto/rsa/rsa_ameth.c $(OPENSSL_PATH)/crypto/rsa/rsa_asn1.c $(OPENSSL_PATH)/crypto/rsa/rsa_backend.c @@ -488,6 +505,8 @@ $(OPENSSL_PATH)/crypto/sm3/legacy_sm3.c $(OPENSSL_PATH)/crypto/sm3/sm3.c $(OPENSSL_PATH)/crypto/stack/stack.c + $(OPENSSL_PATH)/crypto/thread/arch/thread_win.c + $(OPENSSL_PATH)/crypto/thread/api.c $(OPENSSL_PATH)/crypto/txt_db/txt_db.c $(OPENSSL_PATH)/crypto/ui/ui_err.c $(OPENSSL_PATH)/crypto/ui/ui_lib.c @@ -503,14 +522,18 @@ $(OPENSSL_PATH)/crypto/x509/pcy_map.c $(OPENSSL_PATH)/crypto/x509/pcy_node.c $(OPENSSL_PATH)/crypto/x509/pcy_tree.c + $(OPENSSL_PATH)/crypto/x509/t_acert.c $(OPENSSL_PATH)/crypto/x509/t_crl.c $(OPENSSL_PATH)/crypto/x509/t_req.c $(OPENSSL_PATH)/crypto/x509/t_x509.c + $(OPENSSL_PATH)/crypto/x509/v3_ac_tgt.c $(OPENSSL_PATH)/crypto/x509/v3_addr.c $(OPENSSL_PATH)/crypto/x509/v3_admis.c $(OPENSSL_PATH)/crypto/x509/v3_akeya.c $(OPENSSL_PATH)/crypto/x509/v3_akid.c $(OPENSSL_PATH)/crypto/x509/v3_asid.c + $(OPENSSL_PATH)/crypto/x509/v3_audit_id.c + $(OPENSSL_PATH)/crypto/x509/v3_battcons.c $(OPENSSL_PATH)/crypto/x509/v3_bcons.c $(OPENSSL_PATH)/crypto/x509/v3_bitst.c 
$(OPENSSL_PATH)/crypto/x509/v3_conf.c @@ -519,12 +542,17 @@ $(OPENSSL_PATH)/crypto/x509/v3_enum.c $(OPENSSL_PATH)/crypto/x509/v3_extku.c $(OPENSSL_PATH)/crypto/x509/v3_genn.c + $(OPENSSL_PATH)/crypto/x509/v3_group_ac.c $(OPENSSL_PATH)/crypto/x509/v3_ia5.c + $(OPENSSL_PATH)/crypto/x509/v3_ind_iss.c $(OPENSSL_PATH)/crypto/x509/v3_info.c $(OPENSSL_PATH)/crypto/x509/v3_int.c + $(OPENSSL_PATH)/crypto/x509/v3_iobo.c $(OPENSSL_PATH)/crypto/x509/v3_ist.c $(OPENSSL_PATH)/crypto/x509/v3_lib.c $(OPENSSL_PATH)/crypto/x509/v3_ncons.c + $(OPENSSL_PATH)/crypto/x509/v3_no_ass.c + $(OPENSSL_PATH)/crypto/x509/v3_no_rev_avail.c $(OPENSSL_PATH)/crypto/x509/v3_pci.c $(OPENSSL_PATH)/crypto/x509/v3_pcia.c $(OPENSSL_PATH)/crypto/x509/v3_pcons.c @@ -533,12 +561,17 @@ $(OPENSSL_PATH)/crypto/x509/v3_prn.c $(OPENSSL_PATH)/crypto/x509/v3_purp.c $(OPENSSL_PATH)/crypto/x509/v3_san.c + $(OPENSSL_PATH)/crypto/x509/v3_sda.c + $(OPENSSL_PATH)/crypto/x509/v3_single_use.c $(OPENSSL_PATH)/crypto/x509/v3_skid.c + $(OPENSSL_PATH)/crypto/x509/v3_soa_id.c $(OPENSSL_PATH)/crypto/x509/v3_sxnet.c $(OPENSSL_PATH)/crypto/x509/v3_tlsf.c + $(OPENSSL_PATH)/crypto/x509/v3_usernotice.c $(OPENSSL_PATH)/crypto/x509/v3_utf8.c $(OPENSSL_PATH)/crypto/x509/v3_utl.c $(OPENSSL_PATH)/crypto/x509/v3err.c + $(OPENSSL_PATH)/crypto/x509/x509_acert.c $(OPENSSL_PATH)/crypto/x509/x509_att.c $(OPENSSL_PATH)/crypto/x509/x509_cmp.c $(OPENSSL_PATH)/crypto/x509/x509_d2.c @@ -556,6 +589,7 @@ $(OPENSSL_PATH)/crypto/x509/x509_v3.c $(OPENSSL_PATH)/crypto/x509/x509_vfy.c $(OPENSSL_PATH)/crypto/x509/x509_vpm.c + $(OPENSSL_PATH)/crypto/x509/x509aset.c $(OPENSSL_PATH)/crypto/x509/x509cset.c $(OPENSSL_PATH)/crypto/x509/x509name.c $(OPENSSL_PATH)/crypto/x509/x509rset.c @@ -565,6 +599,7 @@ $(OPENSSL_PATH)/crypto/x509/x_attrib.c $(OPENSSL_PATH)/crypto/x509/x_crl.c $(OPENSSL_PATH)/crypto/x509/x_exten.c + $(OPENSSL_PATH)/crypto/x509/x_ietfatt.c $(OPENSSL_PATH)/crypto/x509/x_name.c $(OPENSSL_PATH)/crypto/x509/x_pubkey.c $(OPENSSL_PATH)/crypto/x509/x_req.c @@ -613,7 +648,9 @@ $(OPENSSL_PATH)/providers/implementations/exchange/ecdh_exch.c $(OPENSSL_PATH)/providers/implementations/exchange/ecx_exch.c $(OPENSSL_PATH)/providers/implementations/exchange/kdf_exch.c + $(OPENSSL_PATH)/providers/implementations/kdfs/argon2.c $(OPENSSL_PATH)/providers/implementations/kdfs/hkdf.c + $(OPENSSL_PATH)/providers/implementations/kdfs/hmacdrbg_kdf.c $(OPENSSL_PATH)/providers/implementations/kdfs/kbkdf.c $(OPENSSL_PATH)/providers/implementations/kdfs/krb5kdf.c $(OPENSSL_PATH)/providers/implementations/kdfs/pbkdf2.c @@ -624,6 +661,9 @@ $(OPENSSL_PATH)/providers/implementations/kdfs/sskdf.c $(OPENSSL_PATH)/providers/implementations/kdfs/tls1_prf.c $(OPENSSL_PATH)/providers/implementations/kdfs/x942kdf.c + $(OPENSSL_PATH)/providers/implementations/kem/ec_kem.c + $(OPENSSL_PATH)/providers/implementations/kem/ecx_kem.c + $(OPENSSL_PATH)/providers/implementations/kem/kem_util.c $(OPENSSL_PATH)/providers/implementations/kem/rsa_kem.c $(OPENSSL_PATH)/providers/implementations/keymgmt/dh_kmgmt.c $(OPENSSL_PATH)/providers/implementations/keymgmt/ec_kmgmt.c @@ -634,12 +674,12 @@ $(OPENSSL_PATH)/providers/implementations/macs/gmac_prov.c $(OPENSSL_PATH)/providers/implementations/macs/hmac_prov.c $(OPENSSL_PATH)/providers/implementations/macs/kmac_prov.c - $(OPENSSL_PATH)/providers/implementations/rands/crngt.c $(OPENSSL_PATH)/providers/implementations/rands/drbg.c $(OPENSSL_PATH)/providers/implementations/rands/drbg_ctr.c $(OPENSSL_PATH)/providers/implementations/rands/drbg_hash.c 
$(OPENSSL_PATH)/providers/implementations/rands/drbg_hmac.c $(OPENSSL_PATH)/providers/implementations/rands/seed_src.c + $(OPENSSL_PATH)/providers/implementations/rands/seed_src_jitter.c $(OPENSSL_PATH)/providers/implementations/rands/test_rng.c $(OPENSSL_PATH)/providers/implementations/rands/seeding/rand_cpu_x86.c $(OPENSSL_PATH)/providers/implementations/rands/seeding/rand_tsc.c @@ -649,7 +689,8 @@ $(OPENSSL_PATH)/providers/implementations/signature/eddsa_sig.c $(OPENSSL_PATH)/providers/implementations/signature/mac_legacy_sig.c $(OPENSSL_PATH)/providers/implementations/signature/rsa_sig.c - $(OPENSSL_PATH)/ssl/s3_cbc.c + $(OPENSSL_PATH)/ssl/record/methods/ssl3_cbc.c + $(OPENSSL_GEN_PATH)/crypto/params_idx.c $(OPENSSL_PATH)/providers/common/der/der_ec_key.c $(OPENSSL_PATH)/providers/common/der/der_ec_sig.c $(OPENSSL_PATH)/providers/common/der/der_ecx_key.c @@ -664,7 +705,7 @@ $(OPENSSL_PATH)/providers/implementations/ciphers/ciphercommon_gcm_hw.c $(OPENSSL_PATH)/providers/implementations/ciphers/ciphercommon_hw.c $(OPENSSL_PATH)/providers/implementations/digests/digestcommon.c - $(OPENSSL_PATH)/ssl/record/tls_pad.c + $(OPENSSL_PATH)/ssl/record/methods/tls_pad.c $(OPENSSL_GEN_PATH)/providers/common/der/der_digests_gen.c $(OPENSSL_GEN_PATH)/providers/common/der/der_ec_gen.c $(OPENSSL_GEN_PATH)/providers/common/der/der_ecx_gen.c @@ -681,6 +722,7 @@ $(OPENSSL_PATH)/ssl/s3_msg.c $(OPENSSL_PATH)/ssl/ssl_asn1.c $(OPENSSL_PATH)/ssl/ssl_cert.c + $(OPENSSL_PATH)/ssl/ssl_cert_comp.c $(OPENSSL_PATH)/ssl/ssl_ciph.c $(OPENSSL_PATH)/ssl/ssl_conf.c $(OPENSSL_PATH)/ssl/ssl_err.c @@ -700,12 +742,16 @@ $(OPENSSL_PATH)/ssl/tls13_enc.c $(OPENSSL_PATH)/ssl/tls_depr.c $(OPENSSL_PATH)/ssl/tls_srp.c - $(OPENSSL_PATH)/ssl/record/dtls1_bitmap.c $(OPENSSL_PATH)/ssl/record/rec_layer_d1.c $(OPENSSL_PATH)/ssl/record/rec_layer_s3.c - $(OPENSSL_PATH)/ssl/record/ssl3_buffer.c - $(OPENSSL_PATH)/ssl/record/ssl3_record.c - $(OPENSSL_PATH)/ssl/record/ssl3_record_tls13.c + $(OPENSSL_PATH)/ssl/record/methods/dtls_meth.c + $(OPENSSL_PATH)/ssl/record/methods/ssl3_meth.c + $(OPENSSL_PATH)/ssl/record/methods/tls13_meth.c + $(OPENSSL_PATH)/ssl/record/methods/tls1_meth.c + $(OPENSSL_PATH)/ssl/record/methods/tls_common.c + $(OPENSSL_PATH)/ssl/record/methods/tls_multib.c + $(OPENSSL_PATH)/ssl/record/methods/tlsany_meth.c + $(OPENSSL_PATH)/ssl/rio/poll_immediate.c $(OPENSSL_PATH)/ssl/statem/extensions.c $(OPENSSL_PATH)/ssl/statem/extensions_clnt.c $(OPENSSL_PATH)/ssl/statem/extensions_cust.c @@ -831,6 +877,7 @@ $(OPENSSL_PATH)/crypto/bio/bss_conn.c $(OPENSSL_PATH)/crypto/bio/bss_core.c $(OPENSSL_PATH)/crypto/bio/bss_dgram.c + $(OPENSSL_PATH)/crypto/bio/bss_dgram_pair.c $(OPENSSL_PATH)/crypto/bio/bss_fd.c $(OPENSSL_PATH)/crypto/bio/bss_file.c $(OPENSSL_PATH)/crypto/bio/bss_log.c @@ -873,7 +920,9 @@ $(OPENSSL_PATH)/crypto/bn/rsaz_exp_x2.c $(OPENSSL_PATH)/crypto/buffer/buf_err.c $(OPENSSL_PATH)/crypto/buffer/buffer.c + $(OPENSSL_PATH)/crypto/comp/c_brotli.c $(OPENSSL_PATH)/crypto/comp/c_zlib.c + $(OPENSSL_PATH)/crypto/comp/c_zstd.c $(OPENSSL_PATH)/crypto/comp/comp_err.c $(OPENSSL_PATH)/crypto/comp/comp_lib.c $(OPENSSL_PATH)/crypto/conf/conf_api.c @@ -951,7 +1000,9 @@ $(OPENSSL_PATH)/crypto/err/err_all.c $(OPENSSL_PATH)/crypto/err/err_all_legacy.c $(OPENSSL_PATH)/crypto/err/err_blocks.c + $(OPENSSL_PATH)/crypto/err/err_mark.c $(OPENSSL_PATH)/crypto/err/err_prn.c + $(OPENSSL_PATH)/crypto/err/err_save.c $(OPENSSL_PATH)/crypto/ess/ess_asn1.c $(OPENSSL_PATH)/crypto/ess/ess_err.c $(OPENSSL_PATH)/crypto/ess/ess_lib.c @@ -1034,7 +1085,10 @@ 
$(OPENSSL_PATH)/crypto/ffc/ffc_params.c $(OPENSSL_PATH)/crypto/ffc/ffc_params_generate.c $(OPENSSL_PATH)/crypto/ffc/ffc_params_validate.c + $(OPENSSL_PATH)/crypto/hashtable/hashtable.c $(OPENSSL_PATH)/crypto/hmac/hmac.c + $(OPENSSL_PATH)/crypto/hpke/hpke.c + $(OPENSSL_PATH)/crypto/hpke/hpke_util.c $(OPENSSL_PATH)/crypto/http/http_client.c $(OPENSSL_PATH)/crypto/http/http_err.c $(OPENSSL_PATH)/crypto/http/http_lib.c @@ -1043,6 +1097,7 @@ $(OPENSSL_PATH)/crypto/lhash/lhash.c $(OPENSSL_PATH)/crypto/asn1_dsa.c $(OPENSSL_PATH)/crypto/bsearch.c + $(OPENSSL_PATH)/crypto/comp_methods.c $(OPENSSL_PATH)/crypto/context.c $(OPENSSL_PATH)/crypto/core_algorithm.c $(OPENSSL_PATH)/crypto/core_fetch.c @@ -1052,10 +1107,13 @@ $(OPENSSL_PATH)/crypto/cryptlib.c $(OPENSSL_PATH)/crypto/ctype.c $(OPENSSL_PATH)/crypto/cversion.c + $(OPENSSL_PATH)/crypto/defaults.c $(OPENSSL_PATH)/crypto/der_writer.c + $(OPENSSL_PATH)/crypto/deterministic_nonce.c $(OPENSSL_PATH)/crypto/ebcdic.c $(OPENSSL_PATH)/crypto/ex_data.c $(OPENSSL_PATH)/crypto/getenv.c + $(OPENSSL_PATH)/crypto/indicator_core.c $(OPENSSL_PATH)/crypto/info.c $(OPENSSL_PATH)/crypto/init.c $(OPENSSL_PATH)/crypto/initthread.c @@ -1078,12 +1136,15 @@ $(OPENSSL_PATH)/crypto/provider_conf.c $(OPENSSL_PATH)/crypto/provider_core.c $(OPENSSL_PATH)/crypto/punycode.c + $(OPENSSL_PATH)/crypto/quic_vlint.c $(OPENSSL_PATH)/crypto/self_test_core.c + $(OPENSSL_PATH)/crypto/sleep.c $(OPENSSL_PATH)/crypto/sparse_array.c $(OPENSSL_PATH)/crypto/threads_lib.c $(OPENSSL_PATH)/crypto/threads_none.c $(OPENSSL_PATH)/crypto/threads_pthread.c $(OPENSSL_PATH)/crypto/threads_win.c + $(OPENSSL_PATH)/crypto/time.c $(OPENSSL_PATH)/crypto/trace.c $(OPENSSL_PATH)/crypto/uid.c $(OPENSSL_PATH)/crypto/md5/md5_dgst.c @@ -1100,6 +1161,7 @@ $(OPENSSL_PATH)/crypto/modes/siv128.c $(OPENSSL_PATH)/crypto/modes/wrap128.c $(OPENSSL_PATH)/crypto/modes/xts128.c + $(OPENSSL_PATH)/crypto/modes/xts128gb.c $(OPENSSL_PATH)/crypto/objects/o_names.c $(OPENSSL_PATH)/crypto/objects/obj_dat.c $(OPENSSL_PATH)/crypto/objects/obj_err.c @@ -1136,6 +1198,7 @@ $(OPENSSL_PATH)/crypto/rand/rand_lib.c $(OPENSSL_PATH)/crypto/rand/rand_meth.c $(OPENSSL_PATH)/crypto/rand/rand_pool.c + $(OPENSSL_PATH)/crypto/rand/rand_uniform.c $(OPENSSL_PATH)/crypto/rsa/rsa_ameth.c $(OPENSSL_PATH)/crypto/rsa/rsa_asn1.c $(OPENSSL_PATH)/crypto/rsa/rsa_backend.c @@ -1169,6 +1232,8 @@ $(OPENSSL_PATH)/crypto/sm3/legacy_sm3.c $(OPENSSL_PATH)/crypto/sm3/sm3.c $(OPENSSL_PATH)/crypto/stack/stack.c + $(OPENSSL_PATH)/crypto/thread/arch/thread_win.c + $(OPENSSL_PATH)/crypto/thread/api.c $(OPENSSL_PATH)/crypto/txt_db/txt_db.c $(OPENSSL_PATH)/crypto/ui/ui_err.c $(OPENSSL_PATH)/crypto/ui/ui_lib.c @@ -1184,14 +1249,18 @@ $(OPENSSL_PATH)/crypto/x509/pcy_map.c $(OPENSSL_PATH)/crypto/x509/pcy_node.c $(OPENSSL_PATH)/crypto/x509/pcy_tree.c + $(OPENSSL_PATH)/crypto/x509/t_acert.c $(OPENSSL_PATH)/crypto/x509/t_crl.c $(OPENSSL_PATH)/crypto/x509/t_req.c $(OPENSSL_PATH)/crypto/x509/t_x509.c + $(OPENSSL_PATH)/crypto/x509/v3_ac_tgt.c $(OPENSSL_PATH)/crypto/x509/v3_addr.c $(OPENSSL_PATH)/crypto/x509/v3_admis.c $(OPENSSL_PATH)/crypto/x509/v3_akeya.c $(OPENSSL_PATH)/crypto/x509/v3_akid.c $(OPENSSL_PATH)/crypto/x509/v3_asid.c + $(OPENSSL_PATH)/crypto/x509/v3_audit_id.c + $(OPENSSL_PATH)/crypto/x509/v3_battcons.c $(OPENSSL_PATH)/crypto/x509/v3_bcons.c $(OPENSSL_PATH)/crypto/x509/v3_bitst.c $(OPENSSL_PATH)/crypto/x509/v3_conf.c @@ -1200,12 +1269,17 @@ $(OPENSSL_PATH)/crypto/x509/v3_enum.c $(OPENSSL_PATH)/crypto/x509/v3_extku.c $(OPENSSL_PATH)/crypto/x509/v3_genn.c + 
$(OPENSSL_PATH)/crypto/x509/v3_group_ac.c $(OPENSSL_PATH)/crypto/x509/v3_ia5.c + $(OPENSSL_PATH)/crypto/x509/v3_ind_iss.c $(OPENSSL_PATH)/crypto/x509/v3_info.c $(OPENSSL_PATH)/crypto/x509/v3_int.c + $(OPENSSL_PATH)/crypto/x509/v3_iobo.c $(OPENSSL_PATH)/crypto/x509/v3_ist.c $(OPENSSL_PATH)/crypto/x509/v3_lib.c $(OPENSSL_PATH)/crypto/x509/v3_ncons.c + $(OPENSSL_PATH)/crypto/x509/v3_no_ass.c + $(OPENSSL_PATH)/crypto/x509/v3_no_rev_avail.c $(OPENSSL_PATH)/crypto/x509/v3_pci.c $(OPENSSL_PATH)/crypto/x509/v3_pcia.c $(OPENSSL_PATH)/crypto/x509/v3_pcons.c @@ -1214,12 +1288,17 @@ $(OPENSSL_PATH)/crypto/x509/v3_prn.c $(OPENSSL_PATH)/crypto/x509/v3_purp.c $(OPENSSL_PATH)/crypto/x509/v3_san.c + $(OPENSSL_PATH)/crypto/x509/v3_sda.c + $(OPENSSL_PATH)/crypto/x509/v3_single_use.c $(OPENSSL_PATH)/crypto/x509/v3_skid.c + $(OPENSSL_PATH)/crypto/x509/v3_soa_id.c $(OPENSSL_PATH)/crypto/x509/v3_sxnet.c $(OPENSSL_PATH)/crypto/x509/v3_tlsf.c + $(OPENSSL_PATH)/crypto/x509/v3_usernotice.c $(OPENSSL_PATH)/crypto/x509/v3_utf8.c $(OPENSSL_PATH)/crypto/x509/v3_utl.c $(OPENSSL_PATH)/crypto/x509/v3err.c + $(OPENSSL_PATH)/crypto/x509/x509_acert.c $(OPENSSL_PATH)/crypto/x509/x509_att.c $(OPENSSL_PATH)/crypto/x509/x509_cmp.c $(OPENSSL_PATH)/crypto/x509/x509_d2.c @@ -1237,6 +1316,7 @@ $(OPENSSL_PATH)/crypto/x509/x509_v3.c $(OPENSSL_PATH)/crypto/x509/x509_vfy.c $(OPENSSL_PATH)/crypto/x509/x509_vpm.c + $(OPENSSL_PATH)/crypto/x509/x509aset.c $(OPENSSL_PATH)/crypto/x509/x509cset.c $(OPENSSL_PATH)/crypto/x509/x509name.c $(OPENSSL_PATH)/crypto/x509/x509rset.c @@ -1246,6 +1326,7 @@ $(OPENSSL_PATH)/crypto/x509/x_attrib.c $(OPENSSL_PATH)/crypto/x509/x_crl.c $(OPENSSL_PATH)/crypto/x509/x_exten.c + $(OPENSSL_PATH)/crypto/x509/x_ietfatt.c $(OPENSSL_PATH)/crypto/x509/x_name.c $(OPENSSL_PATH)/crypto/x509/x_pubkey.c $(OPENSSL_PATH)/crypto/x509/x_req.c @@ -1294,7 +1375,9 @@ $(OPENSSL_PATH)/providers/implementations/exchange/ecdh_exch.c $(OPENSSL_PATH)/providers/implementations/exchange/ecx_exch.c $(OPENSSL_PATH)/providers/implementations/exchange/kdf_exch.c + $(OPENSSL_PATH)/providers/implementations/kdfs/argon2.c $(OPENSSL_PATH)/providers/implementations/kdfs/hkdf.c + $(OPENSSL_PATH)/providers/implementations/kdfs/hmacdrbg_kdf.c $(OPENSSL_PATH)/providers/implementations/kdfs/kbkdf.c $(OPENSSL_PATH)/providers/implementations/kdfs/krb5kdf.c $(OPENSSL_PATH)/providers/implementations/kdfs/pbkdf2.c @@ -1305,6 +1388,9 @@ $(OPENSSL_PATH)/providers/implementations/kdfs/sskdf.c $(OPENSSL_PATH)/providers/implementations/kdfs/tls1_prf.c $(OPENSSL_PATH)/providers/implementations/kdfs/x942kdf.c + $(OPENSSL_PATH)/providers/implementations/kem/ec_kem.c + $(OPENSSL_PATH)/providers/implementations/kem/ecx_kem.c + $(OPENSSL_PATH)/providers/implementations/kem/kem_util.c $(OPENSSL_PATH)/providers/implementations/kem/rsa_kem.c $(OPENSSL_PATH)/providers/implementations/keymgmt/dh_kmgmt.c $(OPENSSL_PATH)/providers/implementations/keymgmt/ec_kmgmt.c @@ -1315,12 +1401,12 @@ $(OPENSSL_PATH)/providers/implementations/macs/gmac_prov.c $(OPENSSL_PATH)/providers/implementations/macs/hmac_prov.c $(OPENSSL_PATH)/providers/implementations/macs/kmac_prov.c - $(OPENSSL_PATH)/providers/implementations/rands/crngt.c $(OPENSSL_PATH)/providers/implementations/rands/drbg.c $(OPENSSL_PATH)/providers/implementations/rands/drbg_ctr.c $(OPENSSL_PATH)/providers/implementations/rands/drbg_hash.c $(OPENSSL_PATH)/providers/implementations/rands/drbg_hmac.c $(OPENSSL_PATH)/providers/implementations/rands/seed_src.c + $(OPENSSL_PATH)/providers/implementations/rands/seed_src_jitter.c 
$(OPENSSL_PATH)/providers/implementations/rands/test_rng.c $(OPENSSL_PATH)/providers/implementations/rands/seeding/rand_cpu_x86.c $(OPENSSL_PATH)/providers/implementations/rands/seeding/rand_tsc.c @@ -1330,7 +1416,8 @@ $(OPENSSL_PATH)/providers/implementations/signature/eddsa_sig.c $(OPENSSL_PATH)/providers/implementations/signature/mac_legacy_sig.c $(OPENSSL_PATH)/providers/implementations/signature/rsa_sig.c - $(OPENSSL_PATH)/ssl/s3_cbc.c + $(OPENSSL_PATH)/ssl/record/methods/ssl3_cbc.c + $(OPENSSL_GEN_PATH)/crypto/params_idx.c $(OPENSSL_PATH)/providers/common/der/der_ec_key.c $(OPENSSL_PATH)/providers/common/der/der_ec_sig.c $(OPENSSL_PATH)/providers/common/der/der_ecx_key.c @@ -1345,7 +1432,7 @@ $(OPENSSL_PATH)/providers/implementations/ciphers/ciphercommon_gcm_hw.c $(OPENSSL_PATH)/providers/implementations/ciphers/ciphercommon_hw.c $(OPENSSL_PATH)/providers/implementations/digests/digestcommon.c - $(OPENSSL_PATH)/ssl/record/tls_pad.c + $(OPENSSL_PATH)/ssl/record/methods/tls_pad.c $(OPENSSL_GEN_PATH)/providers/common/der/der_digests_gen.c $(OPENSSL_GEN_PATH)/providers/common/der/der_ec_gen.c $(OPENSSL_GEN_PATH)/providers/common/der/der_ecx_gen.c @@ -1362,6 +1449,7 @@ $(OPENSSL_PATH)/ssl/s3_msg.c $(OPENSSL_PATH)/ssl/ssl_asn1.c $(OPENSSL_PATH)/ssl/ssl_cert.c + $(OPENSSL_PATH)/ssl/ssl_cert_comp.c $(OPENSSL_PATH)/ssl/ssl_ciph.c $(OPENSSL_PATH)/ssl/ssl_conf.c $(OPENSSL_PATH)/ssl/ssl_err.c @@ -1381,12 +1469,16 @@ $(OPENSSL_PATH)/ssl/tls13_enc.c $(OPENSSL_PATH)/ssl/tls_depr.c $(OPENSSL_PATH)/ssl/tls_srp.c - $(OPENSSL_PATH)/ssl/record/dtls1_bitmap.c $(OPENSSL_PATH)/ssl/record/rec_layer_d1.c $(OPENSSL_PATH)/ssl/record/rec_layer_s3.c - $(OPENSSL_PATH)/ssl/record/ssl3_buffer.c - $(OPENSSL_PATH)/ssl/record/ssl3_record.c - $(OPENSSL_PATH)/ssl/record/ssl3_record_tls13.c + $(OPENSSL_PATH)/ssl/record/methods/dtls_meth.c + $(OPENSSL_PATH)/ssl/record/methods/ssl3_meth.c + $(OPENSSL_PATH)/ssl/record/methods/tls13_meth.c + $(OPENSSL_PATH)/ssl/record/methods/tls1_meth.c + $(OPENSSL_PATH)/ssl/record/methods/tls_common.c + $(OPENSSL_PATH)/ssl/record/methods/tls_multib.c + $(OPENSSL_PATH)/ssl/record/methods/tlsany_meth.c + $(OPENSSL_PATH)/ssl/rio/poll_immediate.c $(OPENSSL_PATH)/ssl/statem/extensions.c $(OPENSSL_PATH)/ssl/statem/extensions_clnt.c $(OPENSSL_PATH)/ssl/statem/extensions_cust.c @@ -1403,6 +1495,7 @@ $(OPENSSL_GEN_PATH)/X64-MSFT/crypto/aes/vpaes-x86_64.nasm | MSFT $(OPENSSL_GEN_PATH)/X64-MSFT/crypto/x86_64cpuid.nasm | MSFT $(OPENSSL_GEN_PATH)/X64-MSFT/crypto/md5/md5-x86_64.nasm | MSFT + $(OPENSSL_GEN_PATH)/X64-MSFT/crypto/modes/aes-gcm-avx512.nasm | MSFT $(OPENSSL_GEN_PATH)/X64-MSFT/crypto/modes/aesni-gcm-x86_64.nasm | MSFT $(OPENSSL_GEN_PATH)/X64-MSFT/crypto/modes/ghash-x86_64.nasm | MSFT $(OPENSSL_GEN_PATH)/X64-MSFT/crypto/sha/keccak1600-x86_64.nasm | MSFT @@ -1420,6 +1513,7 @@ $(OPENSSL_GEN_PATH)/X64-GCC/crypto/aes/vpaes-x86_64.s | GCC $(OPENSSL_GEN_PATH)/X64-GCC/crypto/x86_64cpuid.s | GCC $(OPENSSL_GEN_PATH)/X64-GCC/crypto/md5/md5-x86_64.s | GCC + $(OPENSSL_GEN_PATH)/X64-GCC/crypto/modes/aes-gcm-avx512.s | GCC $(OPENSSL_GEN_PATH)/X64-GCC/crypto/modes/aesni-gcm-x86_64.s | GCC $(OPENSSL_GEN_PATH)/X64-GCC/crypto/modes/ghash-x86_64.s | GCC $(OPENSSL_GEN_PATH)/X64-GCC/crypto/sha/keccak1600-x86_64.s | GCC @@ -1530,6 +1624,7 @@ $(OPENSSL_PATH)/crypto/bio/bss_conn.c $(OPENSSL_PATH)/crypto/bio/bss_core.c $(OPENSSL_PATH)/crypto/bio/bss_dgram.c + $(OPENSSL_PATH)/crypto/bio/bss_dgram_pair.c $(OPENSSL_PATH)/crypto/bio/bss_fd.c $(OPENSSL_PATH)/crypto/bio/bss_file.c $(OPENSSL_PATH)/crypto/bio/bss_log.c @@ 
-1571,7 +1666,9 @@ $(OPENSSL_PATH)/crypto/bn/bn_x931p.c $(OPENSSL_PATH)/crypto/buffer/buf_err.c $(OPENSSL_PATH)/crypto/buffer/buffer.c + $(OPENSSL_PATH)/crypto/comp/c_brotli.c $(OPENSSL_PATH)/crypto/comp/c_zlib.c + $(OPENSSL_PATH)/crypto/comp/c_zstd.c $(OPENSSL_PATH)/crypto/comp/comp_err.c $(OPENSSL_PATH)/crypto/comp/comp_lib.c $(OPENSSL_PATH)/crypto/conf/conf_api.c @@ -1649,7 +1746,9 @@ $(OPENSSL_PATH)/crypto/err/err_all.c $(OPENSSL_PATH)/crypto/err/err_all_legacy.c $(OPENSSL_PATH)/crypto/err/err_blocks.c + $(OPENSSL_PATH)/crypto/err/err_mark.c $(OPENSSL_PATH)/crypto/err/err_prn.c + $(OPENSSL_PATH)/crypto/err/err_save.c $(OPENSSL_PATH)/crypto/ess/ess_asn1.c $(OPENSSL_PATH)/crypto/ess/ess_err.c $(OPENSSL_PATH)/crypto/ess/ess_lib.c @@ -1732,7 +1831,10 @@ $(OPENSSL_PATH)/crypto/ffc/ffc_params.c $(OPENSSL_PATH)/crypto/ffc/ffc_params_generate.c $(OPENSSL_PATH)/crypto/ffc/ffc_params_validate.c + $(OPENSSL_PATH)/crypto/hashtable/hashtable.c $(OPENSSL_PATH)/crypto/hmac/hmac.c + $(OPENSSL_PATH)/crypto/hpke/hpke.c + $(OPENSSL_PATH)/crypto/hpke/hpke_util.c $(OPENSSL_PATH)/crypto/http/http_client.c $(OPENSSL_PATH)/crypto/http/http_err.c $(OPENSSL_PATH)/crypto/http/http_lib.c @@ -1741,6 +1843,7 @@ $(OPENSSL_PATH)/crypto/lhash/lhash.c $(OPENSSL_PATH)/crypto/asn1_dsa.c $(OPENSSL_PATH)/crypto/bsearch.c + $(OPENSSL_PATH)/crypto/comp_methods.c $(OPENSSL_PATH)/crypto/context.c $(OPENSSL_PATH)/crypto/core_algorithm.c $(OPENSSL_PATH)/crypto/core_fetch.c @@ -1750,10 +1853,13 @@ $(OPENSSL_PATH)/crypto/cryptlib.c $(OPENSSL_PATH)/crypto/ctype.c $(OPENSSL_PATH)/crypto/cversion.c + $(OPENSSL_PATH)/crypto/defaults.c $(OPENSSL_PATH)/crypto/der_writer.c + $(OPENSSL_PATH)/crypto/deterministic_nonce.c $(OPENSSL_PATH)/crypto/ebcdic.c $(OPENSSL_PATH)/crypto/ex_data.c $(OPENSSL_PATH)/crypto/getenv.c + $(OPENSSL_PATH)/crypto/indicator_core.c $(OPENSSL_PATH)/crypto/info.c $(OPENSSL_PATH)/crypto/init.c $(OPENSSL_PATH)/crypto/initthread.c @@ -1776,12 +1882,15 @@ $(OPENSSL_PATH)/crypto/provider_conf.c $(OPENSSL_PATH)/crypto/provider_core.c $(OPENSSL_PATH)/crypto/punycode.c + $(OPENSSL_PATH)/crypto/quic_vlint.c $(OPENSSL_PATH)/crypto/self_test_core.c + $(OPENSSL_PATH)/crypto/sleep.c $(OPENSSL_PATH)/crypto/sparse_array.c $(OPENSSL_PATH)/crypto/threads_lib.c $(OPENSSL_PATH)/crypto/threads_none.c $(OPENSSL_PATH)/crypto/threads_pthread.c $(OPENSSL_PATH)/crypto/threads_win.c + $(OPENSSL_PATH)/crypto/time.c $(OPENSSL_PATH)/crypto/trace.c $(OPENSSL_PATH)/crypto/uid.c $(OPENSSL_PATH)/crypto/md5/md5_dgst.c @@ -1798,6 +1907,7 @@ $(OPENSSL_PATH)/crypto/modes/siv128.c $(OPENSSL_PATH)/crypto/modes/wrap128.c $(OPENSSL_PATH)/crypto/modes/xts128.c + $(OPENSSL_PATH)/crypto/modes/xts128gb.c $(OPENSSL_PATH)/crypto/objects/o_names.c $(OPENSSL_PATH)/crypto/objects/obj_dat.c $(OPENSSL_PATH)/crypto/objects/obj_err.c @@ -1834,6 +1944,7 @@ $(OPENSSL_PATH)/crypto/rand/rand_lib.c $(OPENSSL_PATH)/crypto/rand/rand_meth.c $(OPENSSL_PATH)/crypto/rand/rand_pool.c + $(OPENSSL_PATH)/crypto/rand/rand_uniform.c $(OPENSSL_PATH)/crypto/rsa/rsa_ameth.c $(OPENSSL_PATH)/crypto/rsa/rsa_asn1.c $(OPENSSL_PATH)/crypto/rsa/rsa_backend.c @@ -1867,6 +1978,8 @@ $(OPENSSL_PATH)/crypto/sm3/legacy_sm3.c $(OPENSSL_PATH)/crypto/sm3/sm3.c $(OPENSSL_PATH)/crypto/stack/stack.c + $(OPENSSL_PATH)/crypto/thread/arch/thread_win.c + $(OPENSSL_PATH)/crypto/thread/api.c $(OPENSSL_PATH)/crypto/txt_db/txt_db.c $(OPENSSL_PATH)/crypto/ui/ui_err.c $(OPENSSL_PATH)/crypto/ui/ui_lib.c @@ -1882,14 +1995,18 @@ $(OPENSSL_PATH)/crypto/x509/pcy_map.c $(OPENSSL_PATH)/crypto/x509/pcy_node.c 
$(OPENSSL_PATH)/crypto/x509/pcy_tree.c + $(OPENSSL_PATH)/crypto/x509/t_acert.c $(OPENSSL_PATH)/crypto/x509/t_crl.c $(OPENSSL_PATH)/crypto/x509/t_req.c $(OPENSSL_PATH)/crypto/x509/t_x509.c + $(OPENSSL_PATH)/crypto/x509/v3_ac_tgt.c $(OPENSSL_PATH)/crypto/x509/v3_addr.c $(OPENSSL_PATH)/crypto/x509/v3_admis.c $(OPENSSL_PATH)/crypto/x509/v3_akeya.c $(OPENSSL_PATH)/crypto/x509/v3_akid.c $(OPENSSL_PATH)/crypto/x509/v3_asid.c + $(OPENSSL_PATH)/crypto/x509/v3_audit_id.c + $(OPENSSL_PATH)/crypto/x509/v3_battcons.c $(OPENSSL_PATH)/crypto/x509/v3_bcons.c $(OPENSSL_PATH)/crypto/x509/v3_bitst.c $(OPENSSL_PATH)/crypto/x509/v3_conf.c @@ -1898,12 +2015,17 @@ $(OPENSSL_PATH)/crypto/x509/v3_enum.c $(OPENSSL_PATH)/crypto/x509/v3_extku.c $(OPENSSL_PATH)/crypto/x509/v3_genn.c + $(OPENSSL_PATH)/crypto/x509/v3_group_ac.c $(OPENSSL_PATH)/crypto/x509/v3_ia5.c + $(OPENSSL_PATH)/crypto/x509/v3_ind_iss.c $(OPENSSL_PATH)/crypto/x509/v3_info.c $(OPENSSL_PATH)/crypto/x509/v3_int.c + $(OPENSSL_PATH)/crypto/x509/v3_iobo.c $(OPENSSL_PATH)/crypto/x509/v3_ist.c $(OPENSSL_PATH)/crypto/x509/v3_lib.c $(OPENSSL_PATH)/crypto/x509/v3_ncons.c + $(OPENSSL_PATH)/crypto/x509/v3_no_ass.c + $(OPENSSL_PATH)/crypto/x509/v3_no_rev_avail.c $(OPENSSL_PATH)/crypto/x509/v3_pci.c $(OPENSSL_PATH)/crypto/x509/v3_pcia.c $(OPENSSL_PATH)/crypto/x509/v3_pcons.c @@ -1912,12 +2034,17 @@ $(OPENSSL_PATH)/crypto/x509/v3_prn.c $(OPENSSL_PATH)/crypto/x509/v3_purp.c $(OPENSSL_PATH)/crypto/x509/v3_san.c + $(OPENSSL_PATH)/crypto/x509/v3_sda.c + $(OPENSSL_PATH)/crypto/x509/v3_single_use.c $(OPENSSL_PATH)/crypto/x509/v3_skid.c + $(OPENSSL_PATH)/crypto/x509/v3_soa_id.c $(OPENSSL_PATH)/crypto/x509/v3_sxnet.c $(OPENSSL_PATH)/crypto/x509/v3_tlsf.c + $(OPENSSL_PATH)/crypto/x509/v3_usernotice.c $(OPENSSL_PATH)/crypto/x509/v3_utf8.c $(OPENSSL_PATH)/crypto/x509/v3_utl.c $(OPENSSL_PATH)/crypto/x509/v3err.c + $(OPENSSL_PATH)/crypto/x509/x509_acert.c $(OPENSSL_PATH)/crypto/x509/x509_att.c $(OPENSSL_PATH)/crypto/x509/x509_cmp.c $(OPENSSL_PATH)/crypto/x509/x509_d2.c @@ -1935,6 +2062,7 @@ $(OPENSSL_PATH)/crypto/x509/x509_v3.c $(OPENSSL_PATH)/crypto/x509/x509_vfy.c $(OPENSSL_PATH)/crypto/x509/x509_vpm.c + $(OPENSSL_PATH)/crypto/x509/x509aset.c $(OPENSSL_PATH)/crypto/x509/x509cset.c $(OPENSSL_PATH)/crypto/x509/x509name.c $(OPENSSL_PATH)/crypto/x509/x509rset.c @@ -1944,6 +2072,7 @@ $(OPENSSL_PATH)/crypto/x509/x_attrib.c $(OPENSSL_PATH)/crypto/x509/x_crl.c $(OPENSSL_PATH)/crypto/x509/x_exten.c + $(OPENSSL_PATH)/crypto/x509/x_ietfatt.c $(OPENSSL_PATH)/crypto/x509/x_name.c $(OPENSSL_PATH)/crypto/x509/x_pubkey.c $(OPENSSL_PATH)/crypto/x509/x_req.c @@ -1992,7 +2121,9 @@ $(OPENSSL_PATH)/providers/implementations/exchange/ecdh_exch.c $(OPENSSL_PATH)/providers/implementations/exchange/ecx_exch.c $(OPENSSL_PATH)/providers/implementations/exchange/kdf_exch.c + $(OPENSSL_PATH)/providers/implementations/kdfs/argon2.c $(OPENSSL_PATH)/providers/implementations/kdfs/hkdf.c + $(OPENSSL_PATH)/providers/implementations/kdfs/hmacdrbg_kdf.c $(OPENSSL_PATH)/providers/implementations/kdfs/kbkdf.c $(OPENSSL_PATH)/providers/implementations/kdfs/krb5kdf.c $(OPENSSL_PATH)/providers/implementations/kdfs/pbkdf2.c @@ -2003,6 +2134,9 @@ $(OPENSSL_PATH)/providers/implementations/kdfs/sskdf.c $(OPENSSL_PATH)/providers/implementations/kdfs/tls1_prf.c $(OPENSSL_PATH)/providers/implementations/kdfs/x942kdf.c + $(OPENSSL_PATH)/providers/implementations/kem/ec_kem.c + $(OPENSSL_PATH)/providers/implementations/kem/ecx_kem.c + $(OPENSSL_PATH)/providers/implementations/kem/kem_util.c 
$(OPENSSL_PATH)/providers/implementations/kem/rsa_kem.c $(OPENSSL_PATH)/providers/implementations/keymgmt/dh_kmgmt.c $(OPENSSL_PATH)/providers/implementations/keymgmt/ec_kmgmt.c @@ -2013,12 +2147,12 @@ $(OPENSSL_PATH)/providers/implementations/macs/gmac_prov.c $(OPENSSL_PATH)/providers/implementations/macs/hmac_prov.c $(OPENSSL_PATH)/providers/implementations/macs/kmac_prov.c - $(OPENSSL_PATH)/providers/implementations/rands/crngt.c $(OPENSSL_PATH)/providers/implementations/rands/drbg.c $(OPENSSL_PATH)/providers/implementations/rands/drbg_ctr.c $(OPENSSL_PATH)/providers/implementations/rands/drbg_hash.c $(OPENSSL_PATH)/providers/implementations/rands/drbg_hmac.c $(OPENSSL_PATH)/providers/implementations/rands/seed_src.c + $(OPENSSL_PATH)/providers/implementations/rands/seed_src_jitter.c $(OPENSSL_PATH)/providers/implementations/rands/test_rng.c $(OPENSSL_PATH)/providers/implementations/rands/seeding/rand_cpu_x86.c $(OPENSSL_PATH)/providers/implementations/rands/seeding/rand_tsc.c @@ -2028,7 +2162,8 @@ $(OPENSSL_PATH)/providers/implementations/signature/eddsa_sig.c $(OPENSSL_PATH)/providers/implementations/signature/mac_legacy_sig.c $(OPENSSL_PATH)/providers/implementations/signature/rsa_sig.c - $(OPENSSL_PATH)/ssl/s3_cbc.c + $(OPENSSL_PATH)/ssl/record/methods/ssl3_cbc.c + $(OPENSSL_GEN_PATH)/crypto/params_idx.c $(OPENSSL_PATH)/providers/common/der/der_ec_key.c $(OPENSSL_PATH)/providers/common/der/der_ec_sig.c $(OPENSSL_PATH)/providers/common/der/der_ecx_key.c @@ -2043,7 +2178,7 @@ $(OPENSSL_PATH)/providers/implementations/ciphers/ciphercommon_gcm_hw.c $(OPENSSL_PATH)/providers/implementations/ciphers/ciphercommon_hw.c $(OPENSSL_PATH)/providers/implementations/digests/digestcommon.c - $(OPENSSL_PATH)/ssl/record/tls_pad.c + $(OPENSSL_PATH)/ssl/record/methods/tls_pad.c $(OPENSSL_GEN_PATH)/providers/common/der/der_digests_gen.c $(OPENSSL_GEN_PATH)/providers/common/der/der_ec_gen.c $(OPENSSL_GEN_PATH)/providers/common/der/der_ecx_gen.c @@ -2060,6 +2195,7 @@ $(OPENSSL_PATH)/ssl/s3_msg.c $(OPENSSL_PATH)/ssl/ssl_asn1.c $(OPENSSL_PATH)/ssl/ssl_cert.c + $(OPENSSL_PATH)/ssl/ssl_cert_comp.c $(OPENSSL_PATH)/ssl/ssl_ciph.c $(OPENSSL_PATH)/ssl/ssl_conf.c $(OPENSSL_PATH)/ssl/ssl_err.c @@ -2079,12 +2215,16 @@ $(OPENSSL_PATH)/ssl/tls13_enc.c $(OPENSSL_PATH)/ssl/tls_depr.c $(OPENSSL_PATH)/ssl/tls_srp.c - $(OPENSSL_PATH)/ssl/record/dtls1_bitmap.c $(OPENSSL_PATH)/ssl/record/rec_layer_d1.c $(OPENSSL_PATH)/ssl/record/rec_layer_s3.c - $(OPENSSL_PATH)/ssl/record/ssl3_buffer.c - $(OPENSSL_PATH)/ssl/record/ssl3_record.c - $(OPENSSL_PATH)/ssl/record/ssl3_record_tls13.c + $(OPENSSL_PATH)/ssl/record/methods/dtls_meth.c + $(OPENSSL_PATH)/ssl/record/methods/ssl3_meth.c + $(OPENSSL_PATH)/ssl/record/methods/tls13_meth.c + $(OPENSSL_PATH)/ssl/record/methods/tls1_meth.c + $(OPENSSL_PATH)/ssl/record/methods/tls_common.c + $(OPENSSL_PATH)/ssl/record/methods/tls_multib.c + $(OPENSSL_PATH)/ssl/record/methods/tlsany_meth.c + $(OPENSSL_PATH)/ssl/rio/poll_immediate.c $(OPENSSL_PATH)/ssl/statem/extensions.c $(OPENSSL_PATH)/ssl/statem/extensions_clnt.c $(OPENSSL_PATH)/ssl/statem/extensions_cust.c @@ -2093,14 +2233,18 @@ $(OPENSSL_PATH)/ssl/statem/statem_dtls.c $(OPENSSL_PATH)/ssl/statem/statem_lib.c $(OPENSSL_GEN_PATH)/AARCH64-GCC/crypto/aes/aesv8-armx.S | GCC + $(OPENSSL_GEN_PATH)/AARCH64-GCC/crypto/aes/bsaes-armv8.S | GCC $(OPENSSL_GEN_PATH)/AARCH64-GCC/crypto/aes/vpaes-armv8.S | GCC $(OPENSSL_GEN_PATH)/AARCH64-GCC/crypto/arm64cpuid.S | GCC + $(OPENSSL_GEN_PATH)/AARCH64-GCC/crypto/md5/md5-aarch64.S | GCC + 
$(OPENSSL_GEN_PATH)/AARCH64-GCC/crypto/modes/aes-gcm-armv8-unroll8_64.S | GCC $(OPENSSL_GEN_PATH)/AARCH64-GCC/crypto/modes/aes-gcm-armv8_64.S | GCC $(OPENSSL_GEN_PATH)/AARCH64-GCC/crypto/modes/ghashv8-armx.S | GCC $(OPENSSL_GEN_PATH)/AARCH64-GCC/crypto/sha/keccak1600-armv8.S | GCC $(OPENSSL_GEN_PATH)/AARCH64-GCC/crypto/sha/sha1-armv8.S | GCC $(OPENSSL_GEN_PATH)/AARCH64-GCC/crypto/sha/sha256-armv8.S | GCC $(OPENSSL_GEN_PATH)/AARCH64-GCC/crypto/sha/sha512-armv8.S | GCC + $(OPENSSL_GEN_PATH)/AARCH64-GCC/crypto/sm3/sm3-armv8.S | GCC # Autogenerated files list ends here [Packages] @@ -2131,16 +2275,16 @@ # C4819: The file contains a character that cannot be represented in the current code page # C4133: incompatible types - from 'ASN1_TYPE *' to 'const ASN1_STRING *' (v3_genn.c(101)) # - MSFT:*_*_IA32_CC_FLAGS = -U_WIN32 -U_WIN64 -U_MSC_VER $(OPENSSL_FLAGS) $(OPENSSL_FLAGS_IA32) /wd4090 /wd4132 /wd4210 /wd4244 /wd4245 /wd4267 /wd4310 /wd4389 /wd4700 /wd4702 /wd4706 /wd4819 /wd4133 - MSFT:*_*_X64_CC_FLAGS = -U_WIN32 -U_WIN64 -U_MSC_VER $(OPENSSL_FLAGS) $(OPENSSL_FLAGS_X64) /wd4090 /wd4132 /wd4210 /wd4244 /wd4245 /wd4267 /wd4306 /wd4310 /wd4700 /wd4389 /wd4702 /wd4706 /wd4819 /wd4133 + MSFT:*_*_IA32_CC_FLAGS = -U_WIN32 -U_WIN64 -U_MSC_VER $(OPENSSL_FLAGS) $(OPENSSL_FLAGS_IA32) /wd4090 /wd4132 /wd4210 /wd4244 /wd4245 /wd4267 /wd4310 /wd4389 /wd4700 /wd4702 /wd4706 /wd4819 /wd4133 /wd4701 /wd4703 /wd4189 + MSFT:*_*_X64_CC_FLAGS = -U_WIN32 -U_WIN64 -U_MSC_VER $(OPENSSL_FLAGS) $(OPENSSL_FLAGS_X64) /wd4090 /wd4132 /wd4210 /wd4244 /wd4245 /wd4267 /wd4306 /wd4310 /wd4700 /wd4389 /wd4702 /wd4706 /wd4819 /wd4133 /wd4701 /wd4703 /wd4189 # # Disable following Visual Studio 2015 compiler warnings brought by openssl source, # so we do not break the build with /WX option: # C4718: recursive call has no side effects, deleting # - MSFT:*_VS2015x86_IA32_CC_FLAGS = /wd4718 - MSFT:*_VS2015x86_X64_CC_FLAGS = /wd4718 + MSFT:*_VS2015x86_IA32_CC_FLAGS = /wd4718 /wd4701 /wd4703 /wd4189 + MSFT:*_VS2015x86_X64_CC_FLAGS = /wd4718 /wd4701 /wd4703 /wd4189 INTEL:*_*_IA32_CC_FLAGS = -U_WIN32 -U_WIN64 -U_MSC_VER -U__ICC $(OPENSSL_FLAGS) $(OPENSSL_FLAGS_IA32) /w INTEL:*_*_X64_CC_FLAGS = -U_WIN32 -U_WIN64 -U_MSC_VER -U__ICC $(OPENSSL_FLAGS) $(OPENSSL_FLAGS_X64) /w diff --git a/CryptoPkg/Library/OpensslLib/OpensslStub/SslExtServNull.c b/CryptoPkg/Library/OpensslLib/OpensslStub/SslExtServNull.c index e3b3aa26ec12..edd8d556a5b7 100644 --- a/CryptoPkg/Library/OpensslLib/OpensslStub/SslExtServNull.c +++ b/CryptoPkg/Library/OpensslLib/OpensslStub/SslExtServNull.c @@ -13,11 +13,11 @@ int tls_parse_ctos_renegotiate ( - SSL *s, - PACKET *pkt, - unsigned int context, - X509 *x, - size_t chainidx + SSL_CONNECTION *s, + PACKET *pkt, + unsigned int context, + X509 *x, + size_t chainidx ) { return -1; @@ -25,11 +25,11 @@ tls_parse_ctos_renegotiate ( int tls_parse_ctos_server_name ( - SSL *s, - PACKET *pkt, - unsigned int context, - X509 *x, - size_t chainidx + SSL_CONNECTION *s, + PACKET *pkt, + unsigned int context, + X509 *x, + size_t chainidx ) { return 0; @@ -37,11 +37,11 @@ tls_parse_ctos_server_name ( int tls_parse_ctos_maxfragmentlen ( - SSL *s, - PACKET *pkt, - unsigned int context, - X509 *x, - size_t chainidx + SSL_CONNECTION *s, + PACKET *pkt, + unsigned int context, + X509 *x, + size_t chainidx ) { return 0; @@ -50,11 +50,11 @@ tls_parse_ctos_maxfragmentlen ( #ifndef OPENSSL_NO_SRP int tls_parse_ctos_srp ( - SSL *s, - PACKET *pkt, - unsigned int context, - X509 *x, - size_t chainidx + SSL_CONNECTION *s, + PACKET *pkt, + unsigned int 
   )
 {
   return 0;
@@ -64,11 +64,11 @@ tls_parse_ctos_srp (
 int
 tls_parse_ctos_ec_pt_formats (
-  SSL           *s,
-  PACKET        *pkt,
-  unsigned int  context,
-  X509          *x,
-  size_t        chainidx
+  SSL_CONNECTION  *s,
+  PACKET          *pkt,
+  unsigned int    context,
+  X509            *x,
+  size_t          chainidx
   )
 {
   return 0;
@@ -76,11 +76,11 @@ tls_parse_ctos_ec_pt_formats (
 int
 tls_parse_ctos_session_ticket (
-  SSL           *s,
-  PACKET        *pkt,
-  unsigned int  context,
-  X509          *x,
-  size_t        chainidx
+  SSL_CONNECTION  *s,
+  PACKET          *pkt,
+  unsigned int    context,
+  X509            *x,
+  size_t          chainidx
   )
 {
   return 0;
@@ -88,7 +88,7 @@ tls_parse_ctos_session_ticket (
 int
 tls_parse_ctos_sig_algs_cert (
-  SSL                       *s,
+  SSL_CONNECTION            *s,
   PACKET                    *pkt,
   ossl_unused unsigned int  context,
   ossl_unused X509          *x,
@@ -100,11 +100,11 @@ tls_parse_ctos_sig_algs_cert (
 int
 tls_parse_ctos_sig_algs (
-  SSL           *s,
-  PACKET        *pkt,
-  unsigned int  context,
-  X509          *x,
-  size_t        chainidx
+  SSL_CONNECTION  *s,
+  PACKET          *pkt,
+  unsigned int    context,
+  X509            *x,
+  size_t          chainidx
   )
 {
   return 0;
@@ -113,11 +113,11 @@ tls_parse_ctos_sig_algs (
 #ifndef OPENSSL_NO_OCSP
 int
 tls_parse_ctos_status_request (
-  SSL           *s,
-  PACKET        *pkt,
-  unsigned int  context,
-  X509          *x,
-  size_t        chainidx
+  SSL_CONNECTION  *s,
+  PACKET          *pkt,
+  unsigned int    context,
+  X509            *x,
+  size_t          chainidx
   )
 {
   return 0;
@@ -128,11 +128,11 @@ tls_parse_ctos_status_request (
 #ifndef OPENSSL_NO_NEXTPROTONEG
 int
 tls_parse_ctos_npn (
-  SSL           *s,
-  PACKET        *pkt,
-  unsigned int  context,
-  X509          *x,
-  size_t        chainidx
+  SSL_CONNECTION  *s,
+  PACKET          *pkt,
+  unsigned int    context,
+  X509            *x,
+  size_t          chainidx
   )
 {
   return 0;
@@ -146,11 +146,11 @@ tls_parse_ctos_npn (
 */
 int
 tls_parse_ctos_alpn (
-  SSL           *s,
-  PACKET        *pkt,
-  unsigned int  context,
-  X509          *x,
-  size_t        chainidx
+  SSL_CONNECTION  *s,
+  PACKET          *pkt,
+  unsigned int    context,
+  X509            *x,
+  size_t          chainidx
   )
 {
   return 0;
@@ -159,11 +159,11 @@ tls_parse_ctos_alpn (
 #ifndef OPENSSL_NO_SRTP
 int
 tls_parse_ctos_use_srtp (
-  SSL           *s,
-  PACKET        *pkt,
-  unsigned int  context,
-  X509          *x,
-  size_t        chainidx
+  SSL_CONNECTION  *s,
+  PACKET          *pkt,
+  unsigned int    context,
+  X509            *x,
+  size_t          chainidx
   )
 {
   return 0;
@@ -173,11 +173,11 @@ tls_parse_ctos_use_srtp (
 int
 tls_parse_ctos_etm (
-  SSL           *s,
-  PACKET        *pkt,
-  unsigned int  context,
-  X509          *x,
-  size_t        chainidx
+  SSL_CONNECTION  *s,
+  PACKET          *pkt,
+  unsigned int    context,
+  X509            *x,
+  size_t          chainidx
   )
 {
   return 0;
@@ -189,11 +189,11 @@ tls_parse_ctos_etm (
 */
 int
 tls_parse_ctos_psk_kex_modes (
-  SSL           *s,
-  PACKET        *pkt,
-  unsigned int  context,
-  X509          *x,
-  size_t        chainidx
+  SSL_CONNECTION  *s,
+  PACKET          *pkt,
+  unsigned int    context,
+  X509            *x,
+  size_t          chainidx
   )
 {
   return 0;
@@ -205,11 +205,11 @@ tls_parse_ctos_psk_kex_modes (
 */
 int
 tls_parse_ctos_key_share (
-  SSL           *s,
-  PACKET        *pkt,
-  unsigned int  context,
-  X509          *x,
-  size_t        chainidx
+  SSL_CONNECTION  *s,
+  PACKET          *pkt,
+  unsigned int    context,
+  X509            *x,
+  size_t          chainidx
   )
 {
   return 0;
@@ -217,11 +217,11 @@ tls_parse_ctos_key_share (
 int
 tls_parse_ctos_cookie (
-  SSL           *s,
-  PACKET        *pkt,
-  unsigned int  context,
-  X509          *x,
-  size_t        chainidx
+  SSL_CONNECTION  *s,
+  PACKET          *pkt,
+  unsigned int    context,
+  X509            *x,
+  size_t          chainidx
   )
 {
   return 0;
@@ -229,11 +229,11 @@ tls_parse_ctos_cookie (
 int
 tls_parse_ctos_supported_groups (
-  SSL           *s,
-  PACKET        *pkt,
-  unsigned int  context,
-  X509          *x,
-  size_t        chainidx
+  SSL_CONNECTION  *s,
+  PACKET          *pkt,
+  unsigned int    context,
+  X509            *x,
+  size_t          chainidx
   )
 {
   return 0;
@@ -241,11 +241,11 @@ tls_parse_ctos_supported_groups (
 int
 tls_parse_ctos_ems (
-  SSL           *s,
-  PACKET        *pkt,
-  unsigned int  context,
-  X509          *x,
-  size_t        chainidx
+  SSL_CONNECTION  *s,
+  PACKET          *pkt,
+  unsigned int    context,
+  X509            *x,
+  size_t          chainidx
   )
 {
   return 0;
@@ -253,11 +253,11 @@ tls_parse_ctos_ems (
 int
 tls_parse_ctos_early_data (
-  SSL           *s,
-  PACKET        *pkt,
-  unsigned int  context,
-  X509          *x,
-  size_t        chainidx
+  SSL_CONNECTION  *s,
+  PACKET          *pkt,
+  unsigned int    context,
+  X509            *x,
+  size_t          chainidx
   )
 {
   return 0;
@@ -265,11 +265,11 @@ tls_parse_ctos_early_data (
 int
 tls_parse_ctos_psk (
-  SSL           *s,
-  PACKET        *pkt,
-  unsigned int  context,
-  X509          *x,
-  size_t        chainidx
+  SSL_CONNECTION  *s,
+  PACKET          *pkt,
+  unsigned int    context,
+  X509            *x,
+  size_t          chainidx
   )
 {
   return 0;
@@ -277,7 +277,7 @@ tls_parse_ctos_psk (
 int
 tls_parse_ctos_post_handshake_auth (
-  SSL                       *s,
+  SSL_CONNECTION            *s,
   PACKET                    *pkt,
   ossl_unused unsigned int  context,
   ossl_unused X509          *x,
@@ -292,11 +292,11 @@ tls_parse_ctos_post_handshake_auth (
 */
 EXT_RETURN
 tls_construct_stoc_renegotiate (
-  SSL           *s,
-  WPACKET       *pkt,
-  unsigned int  context,
-  X509          *x,
-  size_t        chainidx
+  SSL_CONNECTION  *s,
+  WPACKET         *pkt,
+  unsigned int    context,
+  X509            *x,
+  size_t          chainidx
   )
 {
   return EXT_RETURN_FAIL;
@@ -304,11 +304,11 @@ tls_construct_stoc_renegotiate (
 EXT_RETURN
 tls_construct_stoc_server_name (
-  SSL           *s,
-  WPACKET       *pkt,
-  unsigned int  context,
-  X509          *x,
-  size_t        chainidx
+  SSL_CONNECTION  *s,
+  WPACKET         *pkt,
+  unsigned int    context,
+  X509            *x,
+  size_t          chainidx
   )
 {
   return EXT_RETURN_FAIL;
@@ -317,11 +317,11 @@ tls_construct_stoc_server_name (
 /* Add/include the server's max fragment len extension into ServerHello */
 EXT_RETURN
 tls_construct_stoc_maxfragmentlen (
-  SSL           *s,
-  WPACKET       *pkt,
-  unsigned int  context,
-  X509          *x,
-  size_t        chainidx
+  SSL_CONNECTION  *s,
+  WPACKET         *pkt,
+  unsigned int    context,
+  X509            *x,
+  size_t          chainidx
   )
 {
   return EXT_RETURN_FAIL;
@@ -329,11 +329,11 @@ tls_construct_stoc_maxfragmentlen (
 EXT_RETURN
 tls_construct_stoc_ec_pt_formats (
-  SSL           *s,
-  WPACKET       *pkt,
-  unsigned int  context,
-  X509          *x,
-  size_t        chainidx
+  SSL_CONNECTION  *s,
+  WPACKET         *pkt,
+  unsigned int    context,
+  X509            *x,
+  size_t          chainidx
   )
 {
   return EXT_RETURN_FAIL;
@@ -341,11 +341,11 @@ tls_construct_stoc_ec_pt_formats (
 EXT_RETURN
 tls_construct_stoc_supported_groups (
-  SSL           *s,
-  WPACKET       *pkt,
-  unsigned int  context,
-  X509          *x,
-  size_t        chainidx
+  SSL_CONNECTION  *s,
+  WPACKET         *pkt,
+  unsigned int    context,
+  X509            *x,
+  size_t          chainidx
   )
 {
   return EXT_RETURN_FAIL;
@@ -353,11 +353,11 @@ tls_construct_stoc_supported_groups (
 EXT_RETURN
 tls_construct_stoc_session_ticket (
-  SSL           *s,
-  WPACKET       *pkt,
-  unsigned int  context,
-  X509          *x,
-  size_t        chainidx
+  SSL_CONNECTION  *s,
+  WPACKET         *pkt,
+  unsigned int    context,
+  X509            *x,
+  size_t          chainidx
   )
 {
   return EXT_RETURN_FAIL;
@@ -366,11 +366,11 @@ tls_construct_stoc_session_ticket (
 #ifndef OPENSSL_NO_OCSP
 EXT_RETURN
 tls_construct_stoc_status_request (
-  SSL           *s,
-  WPACKET       *pkt,
-  unsigned int  context,
-  X509          *x,
-  size_t        chainidx
+  SSL_CONNECTION  *s,
+  WPACKET         *pkt,
+  unsigned int    context,
+  X509            *x,
+  size_t          chainidx
   )
 {
   return EXT_RETURN_FAIL;
@@ -381,11 +381,11 @@ tls_construct_stoc_status_request (
 #ifndef OPENSSL_NO_NEXTPROTONEG
 EXT_RETURN
 tls_construct_stoc_next_proto_neg (
-  SSL           *s,
-  WPACKET       *pkt,
-  unsigned int  context,
-  X509          *x,
-  size_t        chainidx
+  SSL_CONNECTION  *s,
+  WPACKET         *pkt,
+  unsigned int    context,
+  X509            *x,
+  size_t          chainidx
   )
 {
   return EXT_RETURN_FAIL;
@@ -395,11 +395,11 @@ tls_construct_stoc_next_proto_neg (
 EXT_RETURN
 tls_construct_stoc_alpn (
-  SSL           *s,
-  WPACKET       *pkt,
-  unsigned int  context,
-  X509          *x,
-  size_t        chainidx
+  SSL_CONNECTION  *s,
+  WPACKET         *pkt,
+  unsigned int    context,
+  X509            *x,
+  size_t          chainidx
   )
 {
   return EXT_RETURN_FAIL;
@@ -408,11 +408,11 @@ tls_construct_stoc_alpn (
 #ifndef OPENSSL_NO_SRTP
 EXT_RETURN
 tls_construct_stoc_use_srtp (
-  SSL           *s,
-  WPACKET       *pkt,
-  unsigned int  context,
-  X509          *x,
-  size_t        chainidx
+  SSL_CONNECTION  *s,
+  WPACKET         *pkt,
+  unsigned int    context,
+  X509            *x,
+  size_t          chainidx
   )
 {
   return EXT_RETURN_FAIL;
@@ -422,11 +422,11 @@ tls_construct_stoc_use_srtp (
 EXT_RETURN
 tls_construct_stoc_etm (
-  SSL           *s,
-  WPACKET       *pkt,
-  unsigned int  context,
-  X509          *x,
-  size_t        chainidx
+  SSL_CONNECTION  *s,
+  WPACKET         *pkt,
+  unsigned int    context,
+  X509            *x,
+  size_t          chainidx
   )
 {
   return EXT_RETURN_FAIL;
@@ -434,11 +434,11 @@ tls_construct_stoc_etm (
 EXT_RETURN
 tls_construct_stoc_ems (
-  SSL           *s,
-  WPACKET       *pkt,
-  unsigned int  context,
-  X509          *x,
-  size_t        chainidx
+  SSL_CONNECTION  *s,
+  WPACKET         *pkt,
+  unsigned int    context,
+  X509            *x,
+  size_t          chainidx
   )
 {
   return EXT_RETURN_FAIL;
@@ -446,11 +446,11 @@ tls_construct_stoc_ems (
 EXT_RETURN
 tls_construct_stoc_supported_versions (
-  SSL           *s,
-  WPACKET       *pkt,
-  unsigned int  context,
-  X509          *x,
-  size_t        chainidx
+  SSL_CONNECTION  *s,
+  WPACKET         *pkt,
+  unsigned int    context,
+  X509            *x,
+  size_t          chainidx
   )
 {
   return EXT_RETURN_FAIL;
@@ -458,11 +458,11 @@ tls_construct_stoc_supported_versions (
 EXT_RETURN
 tls_construct_stoc_key_share (
-  SSL           *s,
-  WPACKET       *pkt,
-  unsigned int  context,
-  X509          *x,
-  size_t        chainidx
+  SSL_CONNECTION  *s,
+  WPACKET         *pkt,
+  unsigned int    context,
+  X509            *x,
+  size_t          chainidx
   )
 {
   return EXT_RETURN_FAIL;
@@ -470,11 +470,11 @@ tls_construct_stoc_key_share (
 EXT_RETURN
 tls_construct_stoc_cookie (
-  SSL           *s,
-  WPACKET       *pkt,
-  unsigned int  context,
-  X509          *x,
-  size_t        chainidx
+  SSL_CONNECTION  *s,
+  WPACKET         *pkt,
+  unsigned int    context,
+  X509            *x,
+  size_t          chainidx
   )
 {
   return EXT_RETURN_FAIL;
@@ -482,11 +482,11 @@ tls_construct_stoc_cookie (
 EXT_RETURN
 tls_construct_stoc_cryptopro_bug (
-  SSL           *s,
-  WPACKET       *pkt,
-  unsigned int  context,
-  X509          *x,
-  size_t        chainidx
+  SSL_CONNECTION  *s,
+  WPACKET         *pkt,
+  unsigned int    context,
+  X509            *x,
+  size_t          chainidx
   )
 {
   return EXT_RETURN_FAIL;
@@ -494,11 +494,11 @@ tls_construct_stoc_cryptopro_bug (
 EXT_RETURN
 tls_construct_stoc_early_data (
-  SSL           *s,
-  WPACKET       *pkt,
-  unsigned int  context,
-  X509          *x,
-  size_t        chainidx
+  SSL_CONNECTION  *s,
+  WPACKET         *pkt,
+  unsigned int    context,
+  X509            *x,
+  size_t          chainidx
   )
 {
   return EXT_RETURN_FAIL;
@@ -506,12 +506,60 @@ tls_construct_stoc_early_data (
 EXT_RETURN
 tls_construct_stoc_psk (
-  SSL           *s,
-  WPACKET       *pkt,
-  unsigned int  context,
-  X509          *x,
-  size_t        chainidx
+  SSL_CONNECTION  *s,
+  WPACKET         *pkt,
+  unsigned int    context,
+  X509            *x,
+  size_t          chainidx
   )
 {
   return EXT_RETURN_FAIL;
 }
+
+EXT_RETURN
+tls_construct_stoc_client_cert_type (
+  SSL_CONNECTION  *sc,
+  WPACKET         *pkt,
+  unsigned int    context,
+  X509            *x,
+  size_t          chainidx
+  )
+{
+  return EXT_RETURN_FAIL;
+}
+
+int
+tls_parse_ctos_client_cert_type (
+  SSL_CONNECTION  *sc,
+  PACKET          *pkt,
+  unsigned int    context,
+  X509            *x,
+  size_t          chainidx
+  )
+{
+  return 0;
+}
+
+EXT_RETURN
+tls_construct_stoc_server_cert_type (
+  SSL_CONNECTION  *sc,
+  WPACKET         *pkt,
+  unsigned int    context,
+  X509            *x,
+  size_t          chainidx
+  )
+{
+  return EXT_RETURN_FAIL;
+}
+
+int
+tls_parse_ctos_server_cert_type (
+  SSL_CONNECTION  *sc,
+  PACKET          *pkt,
+  unsigned int    context,
+  X509            *x,
+  size_t          chainidx
+  )
+{
+  return 0;
+}
diff --git a/CryptoPkg/Library/OpensslLib/OpensslStub/SslStatServNull.c b/CryptoPkg/Library/OpensslLib/OpensslStub/SslStatServNull.c
index 878f9e1a0b11..82a95f50d512 100644
--- a/CryptoPkg/Library/OpensslLib/OpensslStub/SslStatServNull.c
+++ b/CryptoPkg/Library/OpensslLib/OpensslStub/SslStatServNull.c
@@ -15,8 +15,8 @@
 int
 ossl_statem_server_read_transition (
-  SSL  *s,
-  int  mt
+  SSL_CONNECTION  *s,
+  int             mt
   )
 {
   return 0;
@@ -31,7 +31,7 @@ ossl_statem_server_read_transition (
 */
 int
 send_certificate_request (
-  SSL  *s
+  SSL_CONNECTION  *s
   )
 {
   return 0;
@@ -43,7 +43,7 @@ send_certificate_request (
 */
 WRITE_TRAN
 ossl_statem_server_write_transition (
-  SSL  *s
+  SSL_CONNECTION  *s
   )
 {
   return WRITE_TRAN_ERROR;
@@ -51,8 +51,8 @@ ossl_statem_server_write_transition (
 WORK_STATE
 ossl_statem_server_pre_work (
-  SSL         *s,
-  WORK_STATE  wst
+  SSL_CONNECTION  *s,
+  WORK_STATE      wst
   )
 {
   return WORK_ERROR;
@@ -64,8 +64,8 @@ ossl_statem_server_pre_work (
 */
 WORK_STATE
 ossl_statem_server_post_work (
-  SSL         *s,
-  WORK_STATE  wst
+  SSL_CONNECTION  *s,
+  WORK_STATE      wst
   )
 {
   return WORK_ERROR;
@@ -81,10 +81,9 @@ ossl_statem_server_post_work (
 */
 int
 ossl_statem_server_construct_message (
-  SSL        *s,
-  WPACKET    *pkt,
-  confunc_f  *confunc,
-  int        *mt
+  SSL_CONNECTION  *s,
+  confunc_f       *confunc,
+  int             *mt
   )
 {
   return 0;
@@ -96,7 +95,7 @@ ossl_statem_server_construct_message (
 */
 size_t
 ossl_statem_server_max_message_size (
-  SSL  *s
+  SSL_CONNECTION  *s
   )
 {
   return 0;
@@ -107,8 +106,8 @@
 */
 MSG_PROCESS_RETURN
 ossl_statem_server_process_message (
-  SSL     *s,
-  PACKET  *pkt
+  SSL_CONNECTION  *s,
+  PACKET          *pkt
   )
 {
   return MSG_PROCESS_ERROR;
@@ -120,8 +119,8 @@ ossl_statem_server_process_message (
 */
 WORK_STATE
 ossl_statem_server_post_process_message (
-  SSL         *s,
-  WORK_STATE  wst
+  SSL_CONNECTION  *s,
+  WORK_STATE      wst
   )
 {
   return WORK_ERROR;
@@ -137,10 +136,10 @@ ossl_statem_server_post_process_message (
 dtls_raw_hello_verify_request (
   return 0;
 }
-int
+CON_FUNC_RETURN
 dtls_construct_hello_verify_request (
-  SSL      *s,
-  WPACKET  *pkt
+  SSL_CONNECTION  *s,
+  WPACKET         *pkt
   )
 {
   return 0;
@@ -148,8 +147,8 @@ dtls_construct_hello_verify_request (
 MSG_PROCESS_RETURN
 tls_process_client_hello (
-  SSL     *s,
-  PACKET  *pkt
+  SSL_CONNECTION  *s,
+  PACKET          *pkt
   )
 {
   return MSG_PROCESS_ERROR;
@@ -161,7 +160,7 @@ tls_process_client_hello (
 */
 int
 tls_handle_alpn (
-  SSL  *s
+  SSL_CONNECTION  *s
   )
 {
   return 0;
@@ -169,44 +168,44 @@ tls_handle_alpn (
 WORK_STATE
 tls_post_process_client_hello (
-  SSL         *s,
-  WORK_STATE  wst
+  SSL_CONNECTION  *s,
+  WORK_STATE      wst
   )
 {
   return WORK_ERROR;
 }
-int
+CON_FUNC_RETURN
 tls_construct_server_hello (
-  SSL      *s,
-  WPACKET  *pkt
+  SSL_CONNECTION  *s,
+  WPACKET         *pkt
   )
 {
   return 0;
 }
-int
+CON_FUNC_RETURN
 tls_construct_server_done (
-  SSL      *s,
-  WPACKET  *pkt
+  SSL_CONNECTION  *s,
+  WPACKET         *pkt
   )
 {
   return 0;
 }
-int
+CON_FUNC_RETURN
 tls_construct_server_key_exchange (
-  SSL      *s,
-  WPACKET  *pkt
+  SSL_CONNECTION  *s,
+  WPACKET         *pkt
   )
 {
   return 0;
 }
-int
+CON_FUNC_RETURN
 tls_construct_certificate_request (
-  SSL      *s,
-  WPACKET  *pkt
+  SSL_CONNECTION  *s,
+  WPACKET         *pkt
   )
 {
   return 0;
@@ -214,8 +213,8 @@ tls_construct_certificate_request (
 MSG_PROCESS_RETURN
 tls_process_client_key_exchange (
-  SSL     *s,
-  PACKET  *pkt
+  SSL_CONNECTION  *s,
+  PACKET          *pkt
   )
 {
   return MSG_PROCESS_ERROR;
@@ -223,8 +222,8 @@ tls_process_client_key_exchange (
 WORK_STATE
 tls_post_process_client_key_exchange (
-  SSL         *s,
-  WORK_STATE  wst
+  SSL_CONNECTION  *s,
+  WORK_STATE      wst
   )
 {
   return WORK_ERROR;
@@ -232,26 +231,26 @@ tls_post_process_client_key_exchange (
 MSG_PROCESS_RETURN
 tls_process_client_certificate (
-  SSL     *s,
-  PACKET  *pkt
+  SSL_CONNECTION  *s,
+  PACKET          *pkt
   )
 {
   return MSG_PROCESS_ERROR;
 }
-int
+CON_FUNC_RETURN
 tls_construct_server_certificate (
-  SSL      *s,
-  WPACKET  *pkt
+  SSL_CONNECTION  *s,
+  WPACKET         *pkt
   )
 {
   return 0;
 }
-int
+CON_FUNC_RETURN
 tls_construct_new_session_ticket (
-  SSL      *s,
-  WPACKET  *pkt
+  SSL_CONNECTION  *s,
+  WPACKET         *pkt
   )
 {
   return 0;
@@ -263,17 +262,17 @@ tls_construct_new_session_ticket (
 */
 int
 tls_construct_cert_status_body (
-  SSL      *s,
-  WPACKET  *pkt
+  SSL_CONNECTION  *s,
+  WPACKET         *pkt
   )
 {
   return 0;
 }
-int
+CON_FUNC_RETURN
 tls_construct_cert_status (
-  SSL      *s,
-  WPACKET  *pkt
+  SSL_CONNECTION  *s,
+  WPACKET         *pkt
   )
 {
   return 0;
@@ -287,8 +286,8 @@ tls_construct_cert_status (
 */
 MSG_PROCESS_RETURN
 tls_process_next_proto (
-  SSL     *s,
-  PACKET  *pkt
+  SSL_CONNECTION  *s,
+  PACKET          *pkt
   )
 {
   return MSG_PROCESS_ERROR;
@@ -298,8 +297,8 @@
 MSG_PROCESS_RETURN
 tls_process_end_of_early_data (
-  SSL     *s,
-  PACKET  *pkt
+  SSL_CONNECTION  *s,
+  PACKET          *pkt
   )
 {
   return MSG_PROCESS_ERROR;
diff --git a/CryptoPkg/Library/OpensslLib/openssl b/CryptoPkg/Library/OpensslLib/openssl
index c523121f902f..98acb6b02839 160000
--- a/CryptoPkg/Library/OpensslLib/openssl
+++ b/CryptoPkg/Library/OpensslLib/openssl
@@ -1 +1 @@
-Subproject commit c523121f902fde2929909dc7f76b13ceb4961efe
+Subproject commit 98acb6b02839c609ef5b837794e08d906d965335